{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string, re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sn\n",
    "import jieba\n",
    "import operator\n",
    "import zhconv\n",
    "from collections import Counter\n",
    "import warnings\n",
    "warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')\n",
    "\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "from tensorflow.keras.preprocessing import sequence\n",
    "from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation\n",
    "from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D\n",
    "from tensorflow.keras.models import Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input locations, given relative to the directory the notebook is run from.\n",
    "comments_path = 'datasets/movie_comments.csv'  # review table read with pd.read_csv\n",
    "model_path = 'models/zhwiki.50d.word2vec'  # presumably a pretrained word2vec file; usage not visible here"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 影评数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>link</th>\n",
       "      <th>name</th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京意淫到了脑残的地步，看了恶心想吐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>“犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>脑子是个好东西，希望编剧们都能有。</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                        link name  \\\n",
       "0   1  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "1   2  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "2   3  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "3   4  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "4   5  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "5   6  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "6   7  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "7   8  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "8   9  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "9  10  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "\n",
       "                                             comment star  \n",
       "0                                 吴京意淫到了脑残的地步，看了恶心想吐    1  \n",
       "1  首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...    2  \n",
       "2  吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...    2  \n",
       "3                      凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。    4  \n",
       "4                                               中二得很    1  \n",
       "5                        “犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。    1  \n",
       "6                                  脑子是个好东西，希望编剧们都能有。    2  \n",
       "7  三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...    4  \n",
       "8  开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...    4  \n",
       "9  15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...    1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the raw review table. low_memory=False lets pandas infer each column's dtype from the\n",
    "# whole file instead of chunk by chunk.\n",
    "comments = pd.read_csv(filepath_or_buffer=comments_path, low_memory=False)\n",
    "comments.iloc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 261497 entries, 0 to 261496\n",
      "Data columns (total 2 columns):\n",
      "comment    261495 non-null object\n",
      "star       261497 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 4.0+ MB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>吴京意淫到了脑残的地步，看了恶心想吐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>“犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>脑子是个好东西，希望编剧们都能有。</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             comment star\n",
       "0                                 吴京意淫到了脑残的地步，看了恶心想吐    1\n",
       "1  首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...    2\n",
       "2  吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...    2\n",
       "3                      凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。    4\n",
       "4                                               中二得很    1\n",
       "5                        “犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。    1\n",
       "6                                  脑子是个好东西，希望编剧们都能有。    2\n",
       "7  三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...    4\n",
       "8  开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...    4\n",
       "9  15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...    1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the review text and its rating. Re-binding the name (rather than inplace=True)\n",
    "# together with errors='ignore' lets this cell be re-run after the columns are already gone.\n",
    "comments = comments.drop(columns=['id', 'link', 'name'], errors='ignore')\n",
    "# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().\n",
    "comments.info()\n",
    "comments.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 删除评论为空的行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 261495 entries, 0 to 261496\n",
      "Data columns (total 2 columns):\n",
      "comment    261495 non-null object\n",
      "star       261495 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 6.0+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Drop every row that has a missing value in any remaining column.\n",
    "comments = comments.dropna()\n",
    "# info() writes straight to stdout and returns None; print() around it only added a stray 'None'.\n",
    "comments.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 删除评分不为数值的行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 261494 entries, 0 to 261496\n",
      "Data columns (total 2 columns):\n",
      "comment    261494 non-null object\n",
      "star       261494 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 6.0+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Keep rows whose rating is one of the five valid star labels, then store the rating as int.\n",
    "# The comparison is done on the string form so the cell stays re-runnable: once 'star' has been\n",
    "# cast to int, matching the column directly against string labels would select nothing.\n",
    "# .copy() makes the filtered frame independent before a column is assigned on it.\n",
    "comments = comments[comments['star'].astype(str).isin(['1', '2', '3', '4', '5'])].copy()\n",
    "# info() prints its own report and returns None, so it is called without print().\n",
    "comments.info()\n",
    "comments['star'] = comments['star'].astype('int')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 查看评分分数分布:\n",
    "- 不同类评分的分布比较不均匀，特别是 1、2 分\n",
    "- **todo：处理这种不平衡**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEGCAYAAACJnEVTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAdPklEQVR4nO3dfXyU5Z3v8c8PDA8GQxBS0iAUcW0XtzxIZz2kBzGy0BKw1BcHtR7XWK1FUdweOYhaqLgGra92Bd0oT5bW1mKpUKBUCgpHxRwe2obgCqtVUKAFZYEDhILUh/R3/rjvwBBCMg9kJnp/36/XvJz7N9fMXBfCd6655n4wd0dERKKhVbY7ICIimaPQFxGJEIW+iEiEKPRFRCJEoS8iEiEKfZFmYGbWQO2sbPRFJJ5CXz6RzKyLmbUK7/c0s8+H98+qH64WaHua1/n78L+9zGzIadqMN7N2ZvaimfUxs7vMLM/M5pjZ4NN0caCZrapXW2NmA5oYV5P9qde+lZmtMbPPNdVWBBT60kKZ2Qwz+08zezm8vWpmT4aPdQJeBq4Nm/cCVppZD+BbQJWZbTSzg2b2JrAR2Ghm59R7j5HAonBW7sBcMzu7ge6cBUwBPgZygW+4+2FgCLDrNEPoA6yNe698oAuwqZExJ9qfeKXA2e6+s4l2IkDwl1mkJfobwaSk7u9oa+BDM/s7YCbwHXf/PwDu/qKZfQ/4R+BJ4JfufsjMlgKPAmuAPu7+l7oXD78lTAUme3CE4nYz+23Yfmxcu3bAuwQh3AkoAV4JZ/gd3P2dsF1bd/8gvP+bsC/HzOxKYBqQB+QDW8OVn0J375BCf4qBXwBHCD6EPg+8Z2avxv3ZWfjndru7v5z4H7lEgemIXGmJzGwQcCEQC0t/AP4MtAGqgUPA94B/BT4Og5IwZMvcfXRc6H8ELAA+7+7HwnaTgGHuPizuPTsAG4DngYnu7maWBzwEXEIQ5E8DB4CuBDP9PwM9gL8A/d39L2b2OvDl8INnGvA28E1grLu/Gb7X2+5+Qdx7J9Sfen9G1wI3ufswM/sR8C/u/n4Kf9wSIVrekRbHzG4DpgM3AP8EDCMIzQeBi939v4AZwH8RhP4fzeyjMCR/DfStWxsP3Q48FBf4/wT8L+Jm0ADufoQgyIcSzOa/ABwD3gF2A5UEHzYHgQuAx909BrxIsORT903ib/WGdFnw8kHg12+TZH/qntMTKI97zuUE30ZEGqXlHWlx3H2mmb0BfKneQ1vcfaWZ3QG0cfeKsP5dM9sBfEAwkbkB2Bf3vOeAxWbWGjgbmEvwLWG9mbUnWA45HLY9lyCA/xGoBc4D9gOTgAcIlmCuBHLi+teDYDZ/Or8HfmRm89z9W2HtbwDh7wzJ9Acz+yywHPgcsCRcLjoP2GBmHvbnendf3kifJKK0vCMtkpk9ArQFqsLS3wPnECyt9AXGEATi3nAZZgfwd8BvgQKCgLwA2Euw9NIGWOju5WbWxt0/DN/nB8Bud38s3F5J8K3glXD7S8CPCNbjWxEs53wEjCD4MXk48Iq794nr+xaCWXctUAjc4+5PmdkGYKq7P29mb7r7F8L2yfSnH7AYeAyY5O7nhfVtwBfd/a9m9hQw393r7z0kouUdabFqCZYsxgLjCPZSMeBx4H8QhPgqgtCt0xmY5u4Xh8sua4Bb3D3m7n3dvRygLmBDlwG/i9vuRtweOe6+ERgE1BD8aPoD4M1wqWgxsJRgeae+S929P8EHRp1HgLLwx+Ejce+RcH/CfnzX3f+9gfeMp9mcNEihLy3V94HrCUL2DeAuYEK4nt+eYHljpbuvCNufBawAPpPoG5jZ14HW7r4hrvxZgvX7eEUE3zhWAnMIdgGFIPQHA79J8C2XADcBHTmxfJNUf9x9h7v/Mtw85d+vmbUh+PD7OME+ScQo9KXFCffDX0SwpNOKYP37i8D/NrP+wDpgubtPCtt/gWCN
/RZ3XxT/UuGtoff4OsGun+PC7Twz+zLw17pdL+NsI9j7py0wGfgHMxsN/BKYSLA//aVx7XOAynA3ypvDbcJxGMG3k21p9KdO/AFnZ4Xv8ybBmv5rp3mORJ2766Zbi7oBvQn2WDkf2APcHNavJQjLIQ08p2MDtZXA8AbqjwH/CXwprjYJ2EKwu2d827OBlwj2FioMa/8MrCbYBRSCffdXAbnh9s1A2/B+v7h2nyX4djAJyEulP/X61jbu/m6C4whysv3/T7eWfdMPudKimVkHD3ZdrDtQqrW7H03zNc8hmEF/dCb6mK6W1h/5dFPoi4hEiNb0RUQiRKEvIhIhLfqI3C5dunjPnj2z3Q0RkU+UjRs37nf3goYea9Gh37NnT6qqqppuKCIix5nZaU+1reUdETlu9uzZFBYWEovF2L59e6Nt58+fz9ChQ7nqqqvYunUrAB9++CFXX3017dq1o2fPnqxevToT3ZYkKPRFBIDNmzdTXl5OdXU1FRUVjB8//rRt165dy4wZM6ioqGDw4MGUlZUB8Mgjj5CTk8OePXt44IEHGDduXKa6LwlS6IsIAEuXLqWsrIyioiKKi4vZv38/R482fEhETU0Nc+bMoXfv3tx444289dZbAPTr14+ZM2eSn5/PFVdcwa5dp7uwmGRLi17TF5HM2bVrFyUlJce3i4qK2LlzJxdddNEpbUeMGAHAsWPHKC8vZ8yYMSfVAVavXs3AgQObt9OSNIW+iABQW1tLXl7e8e3c3FwOHTrU6HPKysp47rnnWLt27Un1Dz74gKlTpzJjxoxm6aukTss7IgJAp06dTgr5Y8eO0apV4xGxcOFCli1bxpgxY4g/uv++++6jf//+DB8+vJFnSzYo9EUEgFgsxvr164HgRIzV1dV069atwbaVlZXs3bsXgGHDhnHgwAHefz+4PO+KFStYuHAhs2bNykzHJSkKfREBoLS0lCVLlrB48WLKy8vp3Lkz3bt3b7DtSy+9xJ133kltbS2rVq2iW7du5Obmsm3bNsrKyliwYAH5+fkZHoEkQqEvIgDk5eWxaNEipk+fzpo1a5g/fz7r1q1j5MiRp7SdNGkStbW1FBUVMXnyZBYsWADAE088QU1NDaNGjaKwsJDCwkLee++9TA9FGtGiz7IZi8VcR+SKJKfnPZ+O66HvePjUDxtJjJlt9OCSoafQTF9EJEIU+iIiEaLQFxGJEIW+iEiEKPRFRCKkydA3s05m9lszqzKzOWFtnpmtN7Mpce1SromISGYkMtO/Hpgf7v5zjplNAlq7ezHQy8wuNLPRqdaaaVwiItKARE649v+AL5pZPtAdqAGeDR97ARgEXJxGbWt6QxARkUQlMtP/v8DngH8B3gDaALvDxw4AXYHcNGonMbOx4VJS1b59+5Idj4iINCKR0J8K3OruDwB/BP4n0D58rEP4GkfSqJ3E3ee6e8zdYwUFDV7XV0REUpRI6HcC+phZa+C/AQ8TLMsA9AN2ABvTqImISIYksqb/feAnBEs864EZQKWZFQGlwEDA06iJiEiGNDnTd/ffu/s/uHsHdx/m7oeBEmADcLm716RTa45BiYhIw1K6XKK7H+TEXjhp10REJDN0RK6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEaLQFxGJEIW+iEiEKPRFRCJEoS8iEiEKfRGRCFHoi4hEiEJfRCRCFPoiIhGi0BcRiZAmL6JiZuOAa8LNfOB34fMuApa7+7Sw3bxUayIikhmJXC5xlruXuHsJUAm8DbR292Kgl5ldaGajU60128hEROQUCV8u0cy6AV0JLm5ed7nDF4BBwMVp1Lam3n0REUlGMmv6twOzgFxgd1g7QPBBkE7tJGY21syqzKxq3759SXRPRESaklDom1kr4HLgZeAI0D58qEP4GunUTuLuc9095u6xgoKCJIcjIiKNSXSmfynwO3d3YCPBsgxAP2BHmjWRFmP27NkUFhYSi8XYvn37adt9+OGH
XH311bRr146ePXuyevVqAPbs2UO7du0oLCyksLCQq6++OlNdF0lIomv6XwVeCe8vBSrNrAgoBQYSrPOnWhNpETZv3kx5eTnV1dXs3LmT8ePHs3z58gbbPvLII+Tk5LBnzx6WLVvGuHHj2Lp1K6+++io33HADc+bMyXDvRRKT0Ezf3b/r7ovD+4eBEmADcLm716RTO7PDEUnd0qVLKSsro6ioiOLiYvbv38/Ro0cbbNuvXz9mzpxJfn4+V1xxBbt27QJg06ZN9O/fP5PdFklKSgdnuftBd3/W3feciZpIS7Br1y769u17fLuoqIidO3c22HbEiBF07NgRgNWrVzNwYPClddOmTfzwhz/kM5/5DAMHDuStt95q/o6LJEFH5IqEamtrycvLO76dm5vLoUOHGn3OBx98wNSpU7n77rsB6NWrF4sWLWLv3r2MGTOGCRMmNGufRZKl0BcJderU6aSQP3bsGK1aNf5P5L777qN///4MHz4cgIcffpgBAwYAcNttt/Hiiy82X4dFUqDQFwnFYjHWr18PgLtTXV1Nt27dTtt+xYoVLFy4kFmzZgHBHj0/+9nPjj++e/duWrduTbDTm0jLkPARuSKfdqWlpUyYMIEhQ4awZcsWOnfuTPfu3Rtsu23bNsrKyli+fDn5+fkAtGnThoceeogePXrQv39/ysvLGTVqFGaWyWGINEozfZFQXl4eixYtYvr06axZs4b58+ezbt06Ro4ceUrbJ554gpqaGkaNGnV8n/z33nuPH//4x3z729/mggsuoLa2loqKiiyMROT0rCV/9YzFYl5VVZXtbsgnTM97Gt63/pNox8OnfuA05dMy/lTGLgEz2+jusYYe00xfRCRCFPoiIhGi0BcRiRCFvohIhCj0RUQiRKEvIkL6p9UGmD9/PkOHDuWqq65i69aWeVFAhb6IRF78abUrKioYP378advGn1b7gQceYNy4cQCsXbuWGTNmUFFRweDBgykrK8tU95Oi0BeRyDsTp9Wuqalhzpw59O7dmxtvvLHFnmFVp2EQkcjbtWsXJSUlx7frTqt90UUXndJ2xIgRx+/Hn1a7rn7s2DHKy8sZM2ZM83Y6RQp9EYm8dE6rPWPGjJPqZWVlPPfcc6xdu7ZZ+pquhJd3zGymmX0tvD/PzNab2ZS4x1OuiYhk05k4rXadhQsXsmzZMsaMGdMiz7CaUOib2aVAobv/xsxGA63dvRjoZWYXplNrpnGJiCQs3dNqA1RWVrJ3714Ahg0bxoEDB3j//febt+MpaDL0zSwHeBLYYWZfJ7jG7bPhwy8Ag9KsiYhkVWlpKUuWLGHx4sWUl5cndFrtBQsWHD+tNsBLL73EnXfeSW1tLatWraJbt27k5uZmaggJS2SmXwa8DvwAuAS4HdgdPnYA6ArkplE7iZmNNbMqM6vat29fsuMREUnamTit9qRJk6itraWoqIjJkyezYMGCLIykaYn8kHsxMNfd95jZz4EvA+3DxzoQfHAcSaN2EnefC8yF4NTKSY5HRCIs7dNKD7oXgK/+ZFuw3ee2U1+z7VCKJgw9qVT8WHXYgetpf+P17AW+Nv9PwJ9S7kpznVo6kZn+NqBXeD8G9OTEskw/YAewMY2aiIhkSCIz/XnAj83sG0AOwbr8MjMrAkqBgYADlSnWREQkQ5qc6bv7X9z9Kncf7O7F7r6TIPg3AJe7e427H0611hyDEhGRhqV0cJa7H+TEXjhp10REJDN07h0RkQhR6IuIRIhCX0QkQhT6IiIRotAXEYkQhb6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEaLQFxGJEIW+iEiEKPRFRCJEoS8iEiGNhr6ZnWVmfzKzl8NbHzP7VzP7g5k9Edcu5ZqIiGROUzP9vsAv3L3E3UuANgQXNr8E2GtmQ83sS6nWmmdIIiJyOk1dLnEgcIWZXQ5sBt4EfuXubmbPE1zcvCaN2ur6b2hmY4GxAD169DgjgxQRkUBTM/0/
AEPd/RIgB2gP7A4fOwB0BXLTqJ3C3ee6e8zdYwUFBUkPSERETq+pmf5r7v5BeL+KE8EP0IHgQ+NIGjUREcmgpoL3aTPrZ2atgSsJZuuDwsf6ATuAjWnUREQkg5qa6T8APAMYsAyYBlSa2WPA8PC2E/h+ijUREcmgRmf67r7F3fu6ex93n+zufwOGApVAqbtvT6fWnAMTEZFTNTXTP4W7HwMWnamaiIhkjn5MFRGJEIW+iEiEKPRFRCJEoS8iEiEKfRGRCFHoi4hEiEJfRCRCFPoiIhGi0BcRiRCFvohIhCj0RUQiRKEvIhIhCn0RkQhR6IuIRIhCX0QkQhIKfTPramabwvvzzGy9mU2JezzlmoiIZE6iM/1/A9qb2WigtbsXA73M7MJ0as0xIBEROb0mr5xlZkOAo8AeoAR4NnzoBYILnV+cRm1rugMQEZHENTrTN7M2wPeAe8JSLrA7vH8A6JpmraH3HGtmVWZWtW/fvmTHIyIijWhqeeceYKa7Hwq3jwDtw/sdwuenUzuFu89195i7xwoKCpIbjYiINKqp0B8K3G5mLwP9ga8RLMsA9AN2ABvTqImISAY1uqbv7oPr7ofBPwqoNLMioBQYCHgaNRERyaCE99N39xJ3P0zwY+4G4HJ3r0mndiYHIiIiTWty75363P0gJ/bCSbsmIiKZoyNyRUQiRKEvIhIhCn0RkQhR6IuIRIhCX0QkQhT6IiIRotAXEYkQhb6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEaLQl1PMnj2bwsJCYrEY27dvb7L9zTffzFNPPXVKfdq0adx///1nvoMikrKEQt/MzjWzYWbWpbk7JNm1efNmysvLqa6upqKigvHjxzfa/r777uPpp58+pf7kk0/y0EMPNVc3RSRFTYa+mXUCngMuAV4yswIzm2dm681sSly7lGvScixdupSysjKKioooLi5m//79HD16tMG2r7/+OgcOHOCaa645qX706FFWrlzJHXfckYkui0gSEpnp9wUmuPuDwPPAEKC1uxcDvczsQjMbnWqteYYlqdq1axd9+/Y9vl1UVMTOnTsbbNu7d28ef/xxWrU6+a9Rbm4uv/rVr2jfvn2z9lVEktfk5RLdfQ2AmQ0mmO2fy4lLHr4ADAIuTqO2Nd1ByJlTW1tLXl7e8e3c3FwOHTrUYFszy1S3ROQMSXRN34BrgIOAA7vDhw4AXYHcNGr132usmVWZWdW+ffuSHY+kqVOnTieF/LFjx06ZyYvIJ1dC/5o9cDvwGvBloO57e4fwNY6kUav/XnPdPebusYKCgqQHJOmJxWKsX78eAHenurqabt26ZblXInKmJPJD7t1mVhZu5gMPEyzLAPQDdgAb06hJC1JaWsqSJUtYvHgx5eXldO7cme7du2e7WyJyhjS5pg/MBZ41s5uBLcBS4BUzKwJKgYEESz6VKdakBcnLy2PRokXcddddtG3blvnz57Nu3ToefPBBli9fnu3uiUiazN2Tf1KwG+cw4BV335Nu7XRisZhXVVUl3b90zZ49m/vvv5/zzjuPhQsXcv755zfYrra2lltvvZWlS5dSUlLCz3/+c9q2bcvmzZu59dZbeeedd7j00kuZO3cu+fn5GR1Dz3s+PQG94+GRSbWP8tjh0zP+KI8dUht/HTPb6O6xhh5L6Rc6dz/o7s/Gh3Y6tZYkmYOTZs+ezY4dO9i9ezcDBgzgscceA+Cmm25iypQpvPvuu5x33nlMmjQpU90XEWmUdsuoJ5mDkxYvXszEiRNp06YNt912G7/+9a95++23OXToEKWlpZgZ1113HevWrcvwKEREGqbQryeZg5Pi23bs2JG6XUwPHz7MRx99BATfHLp00dkrRKRlSOSH3EhJ5uCk+m0//vhjLrjgAs4//3xGjx7NgAEDqKioYNq0ac3ebxGRRGimX08yByfVb/vXv/4VgJUrV3LZZZexadMmcnJy+OY3v9msfRYRSZRCv55kDk6K
b7t9+3bOPvtsAPLz85k4cSI5OTnce++9x+siItmm5Z16SktLmTBhAkOGDGHLli2NHpx07bXXcsstt1BQUMCjjz7KlVdeefyxN954g+rqap555plMdV1EpEma6ddTd3DS9OnTWbNmzfGDk0aOPHWf2cGDBzNx4kS+853vcM455zB16tTjj02dOpWpU6fStm3bTHZfRKRRn+qZfloHagy6F4Cv/mRbsN3nttO8XiEMf5BXgD4PvnKi3OsGfv9HuP8MHCySzkEaIiLxNNMXEYkQhb6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEZLINXI7mtkKM3vBzJaYWRszm2dm681sSly7lGsiIpIZicz0rwOmu/tXgD3AN4DW7l4M9DKzC81sdKq15hmWiIg0pMnTMLj7zLjNAuCfgUfD7ReAQcDFwLMp1rbGv5+ZjQXGAvTo0SOpwYiISOMSXtM3s2KgE/BnYHdYPgB0BXLTqJ3E3ee6e8zdYwUFBUkNRkREGpdQ6JvZuUAFcBNwBGgfPtQhfI10aiIikiGJ/JDbBlgI3OvuO4GNBMsyAP2AHWnWREQkQxI5tfK3gAHAZDObDPwEuN7MioBSYCDgQGWKNRERyZAmZ/ruPsvdO7l7SXj7KVACbAAud/cadz+caq05BiUiIg1L6SIq7n6QE3vhpF0TEZHM0A+pIiIRotAXEYkQhb6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEaLQFxGJEIW+iEiEKPRFRCJEoS8iEiEKfRGRCFHoi4hESKLXyO1qZpXh/Rwz+42ZrTWzm9KtiYhI5iRyjdxOwE+B3LB0B7DR3f87MMbMzkmzJiIiGZLITL8WuAY4HG6XcOLKV68AsTRrIiKSIYlcI/dwvWvZ5gK7w/sHgK5p1k5iZmPNrMrMqvbt25fcaEREpFGp/JB7BGgf3u8QvkY6tZO4+1x3j7l7rKCgIIXuiYjI6aQS+huBQeH9fsCONGsiIpIhZ6XwnJ8CvzWzS4GLgN8RLNmkWhMRkQxJeKbv7iXhf3cCw4C1wFB3r02ndkZHIyIijUplpo+7v8uJvXDSromISGboiFwRkQhR6IuIRIhCX0QkQhT6IiIRotAXEYkQhb6ISIQo9EVEIkShLyISIQp9EZEIUeiLiESIQl9EJEIU+iIiEaLQFxGJEIW+iEiEKPRFRCJEoS8iEiFZCX0zm2dm681sSjbeX0QkqjIe+mY2Gmjt7sVALzO7MNN9EBGJqmzM9Es4cbnEF4BBWeiDiEgkmbtn9g3N5gH/7u7/YWZfAQa4+8Nxj48FxoabXwDezGgHk9cF2J/tTmRJlMcO0R5/lMcOLX/8n3P3goYeSOnC6Gk6ArQP73eg3rcNd58LzM10p1JlZlXuHst2P7IhymOHaI8/ymOHT/b4s7G8s5ETSzr9gB1Z6IOISCRlY6a/FKg0syKgFBiYhT6IiERSxmf67n6Y4MfcDcDl7l6T6T6cYZ+YpahmEOWxQ7THH+Wxwyd4/Bn/IVdERLJHR+SKpMDMzjWzYWbWJdt9EUmGQj8NZtbVzCqz3Y9MM7OOZrbCzF4wsyVm1ibbfcokM+sEPAdcArxkZg3uGvdpFv7d35TtfmSSmZ1lZn8ys5fDW59s9ykVCv0Uhf/wfwrkZrsvWXAdMN3dvwLsAYZnuT+Z1heY4O4PAs8DA7Lcn2z4N07seh0VfYFfuHtJeNuc7Q6lQqGfulrgGuBwtjuSae4+091XhZsFwN5s9ifT3H2Nu28ws8EEs/312e5TJpnZEOAowQd+lAwErjCz34fnD8vG3o9pU+inyN0Pfwr2PEqLmRUDndx9Q7b7kmlmZgQf+geBj7LcnYwJl/K+B9yT7b5kwR+Aoe5+CZADjMhyf1Ki0JeUmNm5QAVwU7b7kg0euB14DRiV7f5k0D3ATHc/lO2OZMFr7v5eeL8K+ESeLFKhL0kLZ3sLgXvdfWe2+5NpZna3mZWF
m/lAlAJwKHC7mb0M9DezH2W5P5n0tJn1M7PWwJXAf2S7Q6nQfvppMrOX3b0k2/3IJDMbBzzEib/0s9z9l1nsUkaFP+I/C7QFtgC3ewT/IUXt776ZfRF4BjBgmbtPznKXUqLQFxGJEC3viIhEiEJfRCRCFPoiIhGi0BcRiRCFvohIhCj0RUQiRKEvIhIh/x9/lr+o11Xz7wAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Star values 1..5 and, for each of them, the number of reviews carrying that rating.\n",
    "x = np.arange(1, 6)\n",
    "nums = [int((comments['star'] == star).sum()) for star in x]\n",
    "\n",
    "def plot_score_distribution(x, nums):\n",
    "    \"\"\"Draw a bar chart of ``nums`` at positions ``x`` and label each bar with its share of the total.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : sequence of numbers\n",
    "        Horizontal position of each bar.\n",
    "    nums : sequence of numbers\n",
    "        Height of each bar, in the same order as ``x``.\n",
    "\n",
    "    The label above a bar is ``round(height / sum(nums), 2)``; it is 0 when the total is 0.\n",
    "    The figure is shown with ``plt.show()``; nothing is returned.\n",
    "    \"\"\"\n",
    "    total = sum(nums)  # computed once, outside the loop\n",
    "    plt.bar(x, nums)\n",
    "    plt.title('影评分数的分布')\n",
    "    # Distinct loop names so the ``x`` argument is not rebound while iterating.\n",
    "    for position, count in zip(x, nums):\n",
    "        # Guard against an all-zero input, which would otherwise divide by zero.\n",
    "        share = round(count / total, 2) if total else 0\n",
    "        # +100 places the label above the bar top (offset is in y-axis data units).\n",
    "        plt.text(position, count + 100, share, ha='center', fontsize=12)\n",
    "    plt.show()\n",
    "    \n",
    "plot_score_distribution(x=x, nums=nums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7fb70c587c90>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD2CAYAAAA6eVf+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUbUlEQVR4nO3db4xk11nn8e+TiS31diWTsRwV63kR25K1q0B7sKcwM+CY6sjO2mv+WrCxdhZkBdSCmCDBSMtAjCJQAkbC4Y+Fo+3F2AEMLRMgjrFjhxduMgQ7xE2EO7sQCZY2ohd7QDO0KWtATHh4UWV5ZtxTf+7tutX2+X6k1lSfe2+f55y6/Ztb93bVjcxEklSGN826AElScwx9SSqIoS9JBTH0Jakghr4kFeTNsy5gmEsvvTQvv/zyytu//PLLzM/P71xBO8S6JmNdk7GuybwR61pbW/uHzHz7tgszc9d+HTx4MOt46qmnam0/LdY1GeuajHVN5o1YF/BsXiBXPb0jSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kF2dUfwyBpd7n82GOVtz26cIY7Km6/cfetlfvVuTzSl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUZGfoRsS8iHo+IZyPifw3a7o+IpyPirrPWq9wmSWrGOEf63w08lJkd4C0R8T+BPZl5GLgyIq6KiNuqtk1pXJKkbUT/zlpDVog4AnwN8LPAo8CXgEcz8/GIuB2YA64BnqjSlpkPnNffErAE0G63D66srFQeXK/Xo9VqVd5+WqxrMtY1mWnWtb65VXnb9hy8eLratgv791bud5Q34vO4uLi4NjhQf41x3pH7R8CtwA8Bfw5cDGwOlp0ErgXma7SdIzOXgWWATqeT3W53jBK3t7q6Sp3tp8W6JmNdk5lmXVXfUQv9d+Tes17tQwA2jnQr9ztKac/jOKd3PgR8f2b+FPAXwH+nf9QO0Br8jF6NNklSQ8YJ3X3AQkTsAb4euBu4frDsALABrNVokyQ1ZJzXWj8DPAC8A3ga+HngeERcBtwCHAKyRpskqSEjj/Qz808y86szs5WZN2XmS0AXeAZYzMytOm3TGJQkaXuVrqpk5ing4Z1qkyQ1wwupklQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCjLyJSkT8APDewbdvAz4/2O6dwGOZ+eHBevdXbZMkNWOc2yV+LDO7mdkFjgN/BezJzMPAlRFxVUTcVrVtaiOTJL1GZOZ4K0bsp39T9BeAJzLz8Yi4HZgDrqnalpkPnNfPErAE0G63D66srFQeXK/Xo9VqVd5+WqxrMtY1mWnWtb5Z/bbW7Tl48XS1bRf2763c7yhvxOdxcXFxLTM72y2b5B65dwIfA/4HsDloOwlcC8zXaDtHZi4DywCdTie73e4EJZ5rdXWVOttPi3VNxromM8267jj2WOVtjy6c4Z71SrflZuNIt3K/o5T2PI51ITci3gQsAqtAj/5RO0Br8DPqtEmSGjJu6L4L+Hz2zwWtAdcP2g8AGzXbJEkNGfe11n8BPjt4/EngeERcBtwCHAKyRpskqSFjHeln5o9n5u8OHr8EdIFngMXM3KrTtrPDkSQNU+mqSmaeAh7eqTZJUjO8kCpJBTH0Jakghr4kFcTQl6SCVHt7nCTWN7dqvUO1qo27b228T71xeKQvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSBjh35E3BcR3zJ4fH9EPB0Rd521vHKbJKkZ494Y/V3AV2XmoxFxG7AnMw8DV0bEVXXapjQuSdI2RoZ+RFwE/G9gIyK+jf7tDl+589Vn6N/ovE6bJKkhkZnDV4j4XuBW4P3AB4BjwLWZ+WcR
8R7gWuAq4JeqtGXm3ef1twQsAbTb7YMrKyuVB9fr9Wi1WpW3nxbrmsxurevEyS1ePN18vwv79w5dPs35Wt+sflvr9hyV52vUmOvYrftXnboWFxfXMrOz3bJxPlr5GmA5M1+IiN8AvgGYGyxr0X+10KvRdo7MXAaWATqdTna73TFK3N7q6ip1tp8W65rMbq3r3oce4Z715j+dfONId+jyac5XnY+SPrpwpvJ8jRpzHcPm6/IZfHT2Kx68uTWV53Gcc/p/CVw5eNwBLufV0zIHgA1grUabJKkh4/y3ez/wqxFxO3AR/fPyn4qIy4BbgENAAscrtkmSGjLySD8z/ykzvyszb8jMw5n5PP3gfwZYzMytzHypats0BiVJ2l6lE2yZeYpX/wqndpskqRm+I1eSCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVJChoR8Rb46Iv4mI1cHXQkT8ZER8ISJ++az1KrdJkpoz6kj/auC3MrObmV3gYvo3Nr8OOBERN0bEwapt0xmSJOlCIjMvvDDi/cCdwMvAOvBloJeZ90XEIfo3N98C/rlKW2Z+aJs+l4AlgHa7fXBlZaXy4Hq9Hq1Wq/L202Jdk9mtdZ04ucWLp5vvd2H/3qHLpzlf65vVb2vdnqPyfI0acx3D5qvOeOu6Yu+eys/j4uLiWmZ2tls26h65XwBuzMy/i4hfA+boBz/ASaANnAH+qmLba2TmMrAM0Ol0stvtjijxwlZXV6mz/bRY12R2a133PvQI96xXus10LRtHukOXT3O+7jj2WOVtjy6cqTxfo8Zcx7D5qjPeuh68eX4qz+OoZ+C5zPyXweNngYvoBz9Ai/7poV6NNklSg0YF769HxIGI2AN8OzBP/7w8wAFgA1ir0SZJatCoI/2fAn4TCOBTwIeB4xHxi8DNg6/ngZ+p2CZJatDQI/3M/FJmXp2ZC5n5wcz8N+BG4Dj9C7F/XadtmgOTJL3WxFdVMvM08ImdapMkNceLqZJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0JekgowV+hHRjogvDh7fHxFPR8RdZy2v3CZJas64R/o/B8xFxG3Answ8DFwZEVfVaZvGgCRJFxaZOXyFiHcD/w34z8BzwBOZ+XhE3A7MAddUbcvMB7bpbwlYAmi32wdXVlYqD67X69FqtSpvPy3WNZndWteJk1u8eLr5fhf27x26fJrztb65VXnb9hyV52vUmOsYNl91xlvXFXv3VH4eFxcX1zKzs92yoffIjYiLgZ8AvgP4JDAPbA4WnwSurdn2Gpm5DCwDdDqd7Ha7Qwc3zOrqKnW2nxbrmsxurevehx7hnvWJbzNd28aR7tDl05yvO449VnnbowtnKs/XqDHXMWy+6oy3rgdvnp/K8zjq9M4x4L7M/MfB9z36R+0ArcH2ddokSQ0aFbw3AndGxCrwtcC3ANcPlh0ANoC1Gm2SpAYNfa2VmTe88ngQ/N8KHI+Iy4BbgENA1miTJDVo7FMsmdnNzJeALvAMsJiZW3XadnIgkqTRJr6qkpmngId3qk2S1BwvpkpSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klSQsUI/Ii6JiJsi4tJpFyRJmp6RoR8R+4DfB64DnoqIt0fE/RHxdETcddZ6ldskSc0Y50j/auBHMvMjwJPAu4E9mXkYuDIiroqI26q2TWdYkqTtRGaOt2LEDcCHgb8EPpGZj0fE7cAccA3wRJW2zHzgvH6WgCWAdrt9cGVlpfLger0erVar8vbTYl2T2a11nTi5xYunm+93Yf/eocunOV/rm9Vvbd2eo/J8jRpzHcPmq85467pi757K
z+Pi4uJaZna2WzbWPXIjIoD3AqeABDYHi04C1wLzNdrOkZnLwDJAp9PJbrc7TonbWl1dpc7202Jdk9mtdd370CPcsz7xbaZr2zjSHbp8mvN1x7HHKm97dOFM5fkaNeY6hs1XnfHW9eDN81N5Hse6kJt9dwLPAd9A/6gdoDX4Gb0abZKkhoxzIfdHI+J7Bt++DbgbuH7w/QFgA1ir0SZJasg4r7WWgYcj4vuALwGfBD4bEZcBtwCH6J/yOV6xTTvk8povRY8unKn8cnbj7ltr9S2pGSOP9DPzVGbelJk3ZOb7M3ML6ALPAIuZuZWZL1Vtm8agJEnbq3RVJTNPAQ/vVJskqRleSJWkghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFaf6DQxq0vrk1k8/O8I1KknYrj/QlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBRnndol7I+LTEfGZiPi9iLg4Iu6PiKcj4q6z1qvcJklqxjhH+keAj2bme4AXgNuBPZl5GLgyIq6KiNuqtk1nWJKk7URmjr9yxCeAtwK/kJmPR8TtwBxwDfBElbbMfOC8PpaAJYB2u31wZWWl8uBOnNzixdOVN69sYf/eoct7vR6tVmvH+13frHf3yfYcledr1JjrmNZ81VXa/gX19rHX4/5V93eqjiv27qn8PC4uLq5lZme7ZWN/9k5EHAb2ARvA5qD5JHAtMF+j7RyZuUz/Zux0Op3sdrvjlvga9z70CPesN//xQhtHukOXr66uUmdcF1L3c4aOLpypPF+jxlzHtOarrtL2L6i3j70e969ZfHbXKx68eX4qz+NYF3Ij4hLgXuB9QI/+UTtAa/Az6rRJkhoyzoXci4HfBn4sM58H1oDrB4sP0D/yr9MmSWrIOK+1vpf+aZgPRsQHgQeA746Iy4BbgENAAscrtkmSGjLySD8zP5aZ+zKzO/j6ONAFngEWM3MrM1+q2jaNQUmStlfpqkpmngIe3qk2SVIzvJAqSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSrIuDdGb0fE8cHjiyLi0Yj4XES8r26bJKk549wYfR/wcWB+0PQBYC0zvxH4zoh4S802SVJDIjOHrxDxViCARzKzGxGfAo5l5v+NiGPA54EfrtqWmU+d198SsATQbrcPrqysVB7ciZNbvHi68uaVLezfO3R5r9ej1WrteL/rm/VuOdyeo/J8jRpzHdOar7pK27+g3j72ety/6v5O1XHF3j2Vn8fFxcW1zOxst2zkPXIHNzMnIl5pmgc2B49PAu2abef3twwsA3Q6nex2u6NKvKB7H3qEe9Yr3Qa4lo0j3aHLV1dXqTOuC7nj2GO1tj+6cKbyfI0acx3Tmq+6Stu/oN4+9nrcv+r+TtXx4M3zU3keq1zI7QFzg8etwc+o0yZJakiV0F0Drh88PgBs1GyTJDWkymutjwOPR8S7gHfSP1e/WaNNktSQsY/0M7M7+Pd54Cbgc8CNmfmVOm07OhpJ0lCVrqpk5v8HHt6pNklSM7yQKkkFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqyExCPyLuj4inI+KuWfQvSaVqPPQj4jZgT2YeBq6MiKuarkGSShWZ2WyHEb8EPJGZj0fE7cBcZj5w1vIlYGnw7X8Cvlyju0uBf6ix/bRY12SsazLWNZk3Yl3vyMy3b7eg0j1ya5oHNgePTwLXnr0wM5eB5Z3oKCKezczOTvysnWRdk7GuyVjXZEqraxbn9HvA3OBxa0Y1SFKRZhG4a8D1g8cHgI0Z1CBJRZrF6Z1PAscj4jLgFuDQFPvakdNEU2Bdk7GuyVjXZIqqq/ELuQARsQ+4CfhsZr7QeAGSVKiZhL4kaTa8iCqNEBGX
RMRNEXHprGs5226tS7vbGyL0I6IdEceHLL8oIh6NiM9FxPt2UV37I+JvI2J18LXt39XuYD17I+LTEfGZiPi9iLj4Aus1+o7pceqKiDdHxN+cNVcLDdW2D/h94DrgqQs9RzOYs5F1zXDO2hHxxSHLZ/KO/GF1zXCuxuo3In4yIr4QEb9ct8/XfegPdv6P0//7/wv5ALCWmd8IfGdEvGWX1PX1wEcyszv4+vspl3UE+Ghmvgd4Abj5/BVm9I7pkXUBVwO/ddZcrTdQ1yv9/khmfgR4kvPeVwIzm7ORdTG7Ofs5Xv2z7HPM+B35F6yL2e5fQ/uNiIP0/+LxOuBERNxYp8PXfegDXwHeC7w0ZJ0u8PDg8WeBJt6IMU5dh4Dvi4g/jYifnnZBmXlfZv7B4Nu3Aye2Wa3Lq3P1GV7989pZ13UI+OaI+JPBkWIjf3mWmX+Ymc9ExA30f+me3ma1Ls3P2Th1NT5nEfFu4GX6/3lvp0vDcwVj1TWT/WvMfr8J+J3sX4B9EnhXnQ5f96GfmS9l5taI1c5/F3B7ulWNXden6f8SfB1wOCKunnZdABFxGNiXmc9ss7jxuRqzri8AN2bmdcBFwH9tsK6g/x/4KeBft1llJnM2Rl2NztngtNxPAMeGrNb4XI1Z16z2r3H63dE5e92H/ph267uA/zgz/ykzvwJ8EZj6S92IuAS4F7jQtY2ZzNUYdT2XmX83ePwsDczVK7LvTuA54Fu3WWUmczZGXU3P2THgvsz8xyHrzGKuxqlrVvvXOP3u6JztlvCbtt36LuAnI+I/RsR/AN4DfGmanQ2OeH4b+LHMfP4CqzU+V2PW9esRcSAi9gDfDvzZtOsa1PajEfE9g2/fBmwXHLOYs3HqanrObgTujIhV4Gsj4le2WWcWv4vj1DWT/WvMfnd2zjLzDfEFrA7+fTfwg+ctewfwf4BfpP9yas8uqWsR+Av6R2o/2EAtP0D/VMDq4OtDwIfPW+etgx3vo8CfA3t3SV1fM5indfoXv5t6/vYBf0D/WtB9wFfvkjkbp66ZzNmg71Xgnbthrsasa1b71zn9ApcAv3LeOm8CPjfIry8DV9Tps5g3Z0X/Yx+uB57M0efaixa+Y3piztn4nKvJRcQccCvwp5n5/2r9rFJCX5JUzjl9SRKGviQVxdCXpIIY+pJUEENfkgry71tHIoWJ+PiuAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the star ratings. Axis labels make the figure readable on its own, and the\n",
    "# trailing ';' keeps the return value's repr out of the cell output.\n",
    "ax = comments['star'].hist()\n",
    "ax.set_xlabel('star')\n",
    "ax.set_ylabel('count');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 文本预处理\n",
    "- 删除符号\n",
    "- 繁体转简体\n",
    "- 分词后以空格连接\n",
    "- 英文大写转小写"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 0.440 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0                         吴京 意淫 到 了 脑残 的 地步 看 了 恶心 想 吐\n",
       "1    首映礼 看 的 太 恐怖 了 这个 电影 不讲道理 的 完全 就是 吴京 在 实现 他 这个...\n",
       "2    吴京 的 炒作 水平 不输 冯小刚 但小刚 至少 不会 用 主旋律 来 炒作 吴京 让 人 ...\n",
       "3                     凭良心说 好 看到 不像 战狼 1 的 续集 完虐 湄公河 行动\n",
       "4                                                中二得 很\n",
       "5                   犯 我 中华 者 虽远必 诛 吴京 比 这句 话 还要 意淫 一百倍\n",
       "6                            脑子 是 个 好 东西 希望 编剧 们 都 能 有\n",
       "7    三星 半 实打实 的 7 分 第一集 在 爱国 主旋律 内部 做 着 各种 置换 与 较劲 ...\n",
       "8    开篇 长镜头 惊险 大气 引人入胜 结合 了 水平 不俗 的 快 剪下 实打实 的 真刀真枪...\n",
       "9    15 100 吴京 的 冷峰 在 这部 里 即 像 成龙 又 像杰 森斯坦 森 但 体制 外...\n",
       "Name: cleaned_comment, dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "punct = r\"，。！？、；：“”\\n＂＃＄％＆＇（）＊＋－／＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〟〰〾〿–—‛„‟…‧﹏★☆•→▽\"\n",
    "\n",
    "\n",
    "def clean_special_chars(text):\n",
    "    re_tok = re.compile(f'([{string.punctuation}{punct}])')\n",
    "    return re_tok.sub(r' ', text)\n",
    "\n",
    "\n",
    "def simplify(text):\n",
    "    return zhconv.convert(text, 'zh-cn')\n",
    "\n",
    "\n",
    "def cut_join(text):\n",
    "    space = ' '\n",
    "    words = jieba.cut(text)\n",
    "    return space.join([w.lower() for w in words if not w.isspace()])\n",
    "\n",
    "\n",
    "def preprocess(text):\n",
    "    text = clean_special_chars(text)\n",
    "    text = simplify(text)\n",
    "    text = cut_join(text)\n",
    "    return text\n",
    "\n",
    "\n",
    "comments['cleaned_comment'] = comments['comment'].apply(preprocess)\n",
    "\n",
    "comments['cleaned_comment'].head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 查看影评的字数分布\n",
    "- 5% 的影评字数少于 3 个，删除这些数据；删除后对类别分布无影响\n",
    "- **todo：更好的方法处理不同影评的这种字数差别**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 4247 19.0 34.62489770319778 7\n"
     ]
    }
   ],
   "source": [
    "comments['comment_length'] = comments['comment'].apply(len).astype('int')\n",
    "\n",
    "min_ = comments['comment_length'].min()\n",
    "max_ = comments['comment_length'].max()\n",
    "median = comments['comment_length'].median()\n",
    "mean = comments['comment_length'].mean()\n",
    "mode = comments['comment_length'].mode()[0]\n",
    "print(min_, max_, median, mean, mode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlwAAAHxCAYAAAC4dmIdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzde7yVZZ3//9eHzfkQAuIBHQ+oKRKSwaR2oG2Tlo7kJAfpiFZfrN9Mfv3VjKO/CLeMll8ffR2n5ptJKdG3KWVLY1JjajqmeShhUBEpLaFGhTJOioJyuH5/rLVhs7k3ewHr3vfae72ej8dur3Wva93rs24N3l7XdV9XpJSQJElSfnoUXYAkSVJ3Z+CSJEnKmYFLkiQpZwYuSZKknBm4JEmScmbgkiRJypmBS1KXFBFRdA37Y1/qj4i/iIiZEXFoHjVJyo+BS1JVRESfiDiizbEZEXF4q+cDIqJXxnuj/P5+Ga+9LyIejohhbV66OCLujIjee1Hj+yLip1k1ZLQ9JiIuLz++OSK+Xa7z/0TEIRFxfEQsi4ijK/zspog4o/z4cOD5iHhzpbWXnQfMBDqsX1JtMXBJaldEvCkiUkQ8HxErW/38KSJ+06b51cDDETGk/N4GYDJwX0QML7d5AXijfM4dP8B2YDNwZ0YZbwEGp5TWtDl+OvBqSumNvfhKy4DR5Vo7sgaYGhGXAG8AW4C/ASYAfwLeCgwGft/RiSLiWEpB6XCAlNLz5Vr+aQ/vGVQOoVF+HsBngHkppT+0aRsR0RAR/Sv4XpIKEK40L6k95R6n14BXKIWiFr2AX6eUxpXbTQeuBd6bUlrW5v33AS+klCaXh8K2ANsoBakHgKPK528ASCm91KaGbwF/Sil9sdWxA4BVlALLw62a/3dK6XflNtOAH+zlV/5fKaXLyu8/HRgLbKDUs5SAxUA/4FZgRqnc9P+U2/cA+qSUNrU9aUR8BxgPjE0pbSsfGwf8CvhYSmm3OstBdF8MSSmt38f3SspJz6ILkFTTWnrBT0oprWw5GBF/A8wqP/4sMBt4X+uwBZBS2lRu2xIeJgOPppQei4hXysc2pJTWR8Q1wIvA18rnnUmrHqCI+P/KD0cD7wGCUuiZUT5+IPAN4NLy89eB36eUjqrki0bE7eX3tJgInAgcDbwZeK78PXoCfwDOAI4pf/8WT1Dq+Wp93jOB6cBZLWGrfG0WR8R1wLci4rmU0i/blHQEpXD6BqWA+xjwQ6AJOAV4nlKPYVsbKvm+kjqXQ4qS9qQHpb/AH4+I9S0/wHeA7RFxEnAR0JhSeiIiToqIAyJiWEQcFxFHUeoRahkOHA98pe2HRMSbgP8J9Gl1eBPwEDCk/HNU+fgW4O+AK1NKR7X8AP9OKZy0eBl4NCIGRsTwcl3t/fQBfgn8ruXNKaXPA58D3gQsBx4E/hv4EKXwcyQwMKUUwMXAPcDb23yvA4Gbge+llH6acX0vp9TLd29ETG79Qkrpv1NKq1NKa4HPUgpds8q9V9cD70wprc/4cdhCqkH2cEnKFBE9ga3AsJTStoh4LzAXGFl+3gD0Bd6WUtoeEYcBdwHfphSw/rnV6Q6n1BtzLfBURLydXcPRpygFrBtaHdsGbG0ZHmt1U9//AIYCX29Tci9a9VCllO6lFGQuKNe9J/9vSmmXIBgR5wLfAr4MHEvpz8s3l8+1rvy8P/AqcAilYc83Wr1/APAjSnPTPh8R0TYMpZS2RsSHKAXY5oiYD1ySUlrV6jx/BXwRODel9HL58Jbyj6Quwh4uSe35a0q9WxvKvVo/pDRJfE35+YbyzxERMZBSuPgRpaHG/0MpALXcwfcGQHnI8S7gsDafdSrwrymljRXUNZ/S0OQV5RDYohelcNPWvwG9U0rR+odScHsZ+CbwvdZvKPe4XUSp1+0mSnPMngA+DDwCfJJS6BpbfsvBwIo2n/t9SgHtWkqT7Le3vVmgPE9r
M/Ab4B+AUcCOaxARJ5e/b0/gJ63eMwa4uc257qng2kkqiD1ckjKllH4UEYMoBYEsa1NK34jSUhAPAL8A/me5F2cL7NIrtaU8vHYgpeGxRGl+FJTmKv0j0FBeJmFNqzsS35MxefzPKaWVEXEhpTv/7isf782uc7BavscuPUERcSKl3rEjKM2rejjjPS9HxJTy95hKKWAdm1J6JSIWA3dQCpvvBn5GKSjd2+Y0n6M0FPocpeHK7cASSsOP97Vq9zDwh5TSzRHxtZZesoh4N6UA+3Pgr4CPULrGUBrevIFSqAO4jNJNCJJqlIFL0p40UJq4fgHwx1bHTwM+FRH3UQpbX04pXQ8QEeNTSosyzvVRSr09r1MKHw3l4w9SCmA9KP2ZdDnwL+XXHgLOKT8eDKxsdb6vU5pb9taU0uOU5n/t6OFqtRRF26G3j1HqmXorsLF8xyOUJuH3pTQ0uI3SEOjgVu97uRwg1wMHAR8EvhQRXwFOZte7JSkv3dCyfMMTEXFc+Tvfl1L6dbnGXsABlCbA0ypsjaMUyhYC0yj1kG1sNby6DXit1fPNlIZ/JdUoA5ekPWkJRV9n92Uhfp9S+nVEHJNSegWgPF/qxog4sWV5hla+nVJqCVJExFsp9fgcuYdlDLLmcAFQnqT/OKVA8jilyfmthySbKd3N2J7/buf40ZSC3QjgLygt3fDmlNIfI+KbQM+U0paI+A9gDnAN8HxKqaP1uD4A/JnS8GGLQykFvV1qKd/B+NfAveX5ch2cWlKtM3BJald5CO0gYC0wiNK8pZMp9bb8tqUNQEScT2mYa3pG2DoKWBARf93Su1MlF1JaQBRgIKU5WS3OAra1XRi1vJDpJW2XiyjfBDCE0nckpfRaRFxWPueFEfEMpV66t5Vf3xQRcygN531hT0WWJ9B/ntLdiq2HSEeUfz/f9j0ppbtbPe1ovm1vdi69IakGOWleUrvKk8fvA/6+1eGewN3ldbKIiB7lNbJuAqallG5p1bZlC5qfUhoe/O1eltCzZekGdh3eAyCl9HirOVpDabUGVUpp096sQp9S2pZS+nPrtbIorfH1UUp3WS6gFGw+HxEHlwNaS2A6JtrphirfULCA0pBn2xXujwFebgmte9An62BEvCUi7qK0TMbKDs4hqUAGLkmZypPcf05pSO2fgRPKL71GaQL3wIgYQ2kO1/8ATk8p/ajNaT5d/v19Sj1fezPPqBfwTko9TuvYGSiy9mI8lNKdj1kLge7WfC9qoHzevwb+N3ASpWHLI4H7Kd2F2DKh/bvlcNW6rr+iNNH9LZTWKvtz+fiEiPg88P8BWfPd2voJpeHIFj3K3+NpSvPW/gW4ci+/l6RO5JCipPasBa5KKS2IiPGUlkN4CPhdSun1cofOEuAW4Jx25mHdDmxPKV2e8VpLcGrvP/z6AD9PKTXCjt62J2kVmCJiFqUV398C/J+U0nPtfZmIeA/QCJzNrkOP7bWfSWlZiGXAx1NKvygf/zOlIPpt4NLy0OIHKPVifY/SfostE9/voHT34sSUUut5Wq9TClsP0cFwJEBK6bw2hxoozSXbDpzb0fslFc+9FCVVJCKOTimtaHNsdNvtfPbifO+idIfiYSmlF/fxHCdSWnT0sY6G5cqLrd4KPAt8PaW0sIP2I4H+KaWn2hw/BDgw4/gwSnsrrm117IjUZqPpaoiI3wLXp5T+tdrnlpQPA5ckSVLOKprDFRE3RcQjLZNk96ZNeXLpkr05lyRJUnfSYeCKiPOAhpTSacDI8uJ9e9Pmq5TWx6noXJIkSd1NJZPmGynt5QVwN/AuSnMgOmxT3ufsVWB1peeKiBmUbsVmwIAB40444QQ63Wvlm4H6H9j5n52DbevWAdAwZEjBlRRv3ebStRjS12shSaquxYsX/zmlNDzrtUoC1wB23mq9lvKifx21iYjewJeAD1G6U6mic6WU5lBavZnx48enRYsquWNakiSpWBHR7o4Tlczh2kh5SJDSSs5Z78lqcxnw
jTa3ildyLkmSpG6lksCzmNLQH5Q2fF1ZYZv3AX8bEfcDb42Ib1d4ruItmlv66SbW3TqfdbfO77hhHWh+ppnmZ5qLLkOSVGc6XBaivNjgg5QW7zuL0kaxU1JKM/fQ5tSU0oZWr9+fUmrsqF1bhQ0pNpV3EGlqt7QuZfkJowAY9evlBVdSvDHzxgCwdPrSgiuRJHU3EbE4pTQ+67UO53CllF6OiEZKqzlfm1JaDTzRQZsNbV5vrKSdJEmCLVu28Pzzz7N58+aiS1GGvn37cvjhh9Or1247jbWroq19Ukrr2Hl34T632Zt2kiTVq+eff55BgwZx1FFH0c6+6CpISok1a9bw/PPPc/TRR1f8PietS5JUYzZv3sywYcMMWzUoIhg2bNhe9z4auCRJqkGGrdq1L/9sDFySJEk5M3BJkqT9dsEFF7By5cqqn/eKK66gsbGRxsZGTjjhBL7yla9kttuyZQsTJ07kne98JzfffHO7x4pS0aT5utNNloNo4XIQO7kchCTVhnPPPZcNG3b+ffuRj3yEGTNm7Nbuyiuv3PF48uTJfOITn8g839e//nXGjRtHU1MTZ599NlOmTOFb3/rWbscGDRpU/S9TAQOXJEm1rmV9yCznXA/jLyw9XjQXfnzJHs5TWYfCuHHjOOigg+jduzerV6/mwgsvZOrUqUyfPp3169czbtw4rr/+elasWMFHP/pR+vfvz8svvwzAH//4Ry644AI2bNjAxIkTufzyyzM/40c/+lFFtbR47LHHOPzwwznssMMyX7///vu55pprAJgwYQKLFi3KPHb66afv1edWi0OKkiRpF6+99hrNzc08+eSTfP/73+eXv/wlX/7yl5k2bRoPPvggGzZs4Kc//SnXXnstl156KT/96U955ZVXAPjKV77C+eefz8MPP8ztt9/OmjVrqlLTv/zLv/C5z32u3ddfffXVHWFs6NCh/PGPf8w8VhR7uLLcOKH0+6IHiq2jSlacNwmAo3+4oOBKijd14VQA5k90KThJXUilU13GX7izt2s/HHzwwQwcOJAjjzyShoYGUko8/fTTfOYznwHglFNOYfny5axYsYKxY8fSs2dP3vrWtwLwm9/8hkceeYTvfOc7vPrqq7z44osMGzZst8+odEgRYP369fzpT3/imGOOabfmgQMHsmnTJgYPHszGjRsZOHBg5rGiGLiyrHqi4zZdyOanny66hJqxfK3z2SRpX4wePZpHH32UY489lkcffZSPfOQjLF++nGXLlnHEEUewdGlpjuzxxx/Pueeey+mnn873vvc9hg4dmnm+vRlS/NGPfsTZZ5+9xzbjxo3jF7/4BZMnT+aJJ57g1FNPzTxWFAOXJEnq0OWXX84nPvEJvvnNbzJ+/HjOPPNMRo4cycc+9jGuu+46evfuDcBll13Gpz71KWbOnMnRRx/NtGnT9vuz77rrLv7+7/9+x/P77ruPp59+mr/7u7/bcWz69OmcffbZPPjggzz99NOccsopHHbYYbsdK0qHm1cXyc2rq8PNq3dy82pJXcHy5csZNWpU0WV0OS+++CK/+MUveP/738/gwYPbPVYNWf+M9mvzakmSpK5gxIgRTJ06tcNjRfAuRUmSpJwZuCRJknLmkGKWt00vuoKqOmDKlKJLqBmTjptUdAmSpDpk4Mrywa8VXUFVHfpPs4suoWY0vaOp6BIkSXtp1apVLFu2jFNOOaWwrXn2l0OKkiSpZj3zzDOcf/75PPTQQ7znPe/hjTfe2PHaU089xRlnnNHhOSZOnMjjjz8OwB/+8AcaGxt573vfy4wZM+is1Rrs4cry4pLS7xEnF1tHlWx6ahkA/d4yuuBKirdsTelajB7mtZCkIlW60vyTTz7J3LlzOeaYY1i6dCkrVqzg+OOPJ6XE5z//ebZs2bLHz/m3f/s3jjnmmB0r4d94443ccMMNjBo1irPOOoulS5dy0kknVffLZTBwZZnTWPrdTdbhWjl5MuA6XADTflxagM91uCR1JS1r
CGaZddospry5NFe3+ZlmZj/S/jSSSv/sy9q8+kMf+tBum1K/+OKLTJ06lYhgwoQJXH311VxwwQWMHDmSe+65h23btnHvvffSr1+/3T6j0pXmJ0+ezNatW/nJT37CunXrOPbYYwGYO3cup59+OnfddVe77127di1f+MIX+OxnP8t//ud/cvrpp3P11VfveH3NmjUceOCBFdWxvxxSlCRJu8javDprU+oXXniBa665hjvvvJOFCxfueP/GjRt58MEHOeGEE1iyZMl+17Nx40bmz5/PkUceSUSwZs0avve97+2y+nyWf/7nf2bKlClcdNFFfPe73+WOO+7Y8dqtt97K6NGjGTFixH7XVwl7uCRJqnGV9kxNefOUHb1d+yNr8+qsTal79uzJlVdeycCBA3nllVd2vH/69NLd/kccccQuc65a25vNqw844ADmzZvHxz/+cR577DG+/e1v85WvfIVevXrt8XssWbKEr371qxxyyCFMnTqVe+65hw9+8IM899xzfPWrX+VnP/vZ3l6afWbgkiRJHcralPqyyy7j8ssvZ+zYsbvMgxowYECH56t0SPGzn/0sH/7wh5kwYQLr16/ngAMO4Oc//znPPvssAI8//jgzZ87kqquu2u29xx57LM899xwnnHACixYt4sgjj2TdunV8+MMf5uabb67qVj8dMXBJkqQOZW1Kfc455/CZz3yG4cOH079/f1544YWqf+6ll17Kxz/+cSKCM888k+OPP55nnnlmx+uNjY1cddVVmRtaX3rppXz605/m6quvpn///vzwhz/kqquu4g9/+AOf+9znALjyyit5z3veU/W623Lz6ixuXt1tuXm1pK7Azatr395uXu2keUmSpJw5pJhlxv3cOAdWNe081NTUTtsu4Kjbbiu6hJpxyzm3FF2CJKkOGbiyjDiZVUXXUEUueLqTC55KkorgkKIkSVLODFxZ7riYiVxcdBVVs+pLs1j1pVlFl1ETmh5uounhpqLLkCTVGQNXlv+axzjmFV1F1axvbmZ9c3PRZdSEBc8uYMGzC4ouQ5K6nQsuuICVK1cCsHr1aq655ppOr2H58uWce+65O57fc889NDY2ctppp/GDH/yg3fetX7+eCRMm8M53vpM777yz3WP7wzlckiSpqg455BAuu+yyqpyr0hXpf/e73/EP//APbNy4EYBt27bxhS98gYceeohevXoxduxYPvShD9G3b9/d3jtr1iw++clP8vGPf5z3ve99fOADH8g8FhH7/D0MXJIk1biW9RSzHHLllQw5fyoA626dz+orrmi3baXrMVa6efWKFSv46Ec/Sv/+/Xn55Zd3vH/lypU0NTXxne98B6BTNrkeNGgQCxYs4P3vfz8Ar7zyCgMHDmTQoEEA9OnTh02bNmUGrgceeIAvf/nLNDQ0cPzxx7Ny5crMY0cffXRFtWRxSFGSJO2i0s2rr732Wi699FJ++tOf7rKXYludscn1QQcdRJ8+fXY8P+CAAxg8eDC33HIL//qv/8pBBx3EkCFDMt/bs2dPBg4cCMDQoUP54x//mHlsf9jDJUlSjau0Z2rI+VN39Hbtj0o3r16xYgVjx46lZ8+evPWtb233fJ21yXVbt99+O/feey8zZ85k7ty57bZraGjY8Xjjxo1s374989j+MHBJkqQOZW1efcQRR7Bs2TKOOOIIli5tf8u06667LvdNrrP06dOHAw88kJNOOol3v/vd7bYbPXo0ixYtYvz48TzxxBP84z/+Y+ax/WHgynLoWF7sRiuf9j3xxKJLqBmjhro3mSTti6zNqy+99FI+9rGPcd1119G7d+9239sZm1y3Z+bMmdxwww07nn//+9+nd+/eTJ48ecexz372s3zqU5/ilFNOYdCgQRx22GGZx/aHm1e3o+1WPl15ax9JUtfi5tWd77e//S2PP/44EydO3DEXLOtYi73dvNoeLkmSVPeOPfZYjj322A6P7SvvUpQkqQbV8ghUvduXfzYGrixNg2licNFV
VM3yE0btcQ2XejJm3hjGzBtTdBmStEd9+/ZlzZo1hq4alFJizZo1met57YlDipIk1ZjDDz+c559/npdeeqnoUpShb9++HH744Xv1HgOXJEk1plevXvu1qrlqj0OKkiRJObOHax9lLRPh0hGSJClLrj1cETE0Is6IiAPz/BxJkqRaVlHgioibIuKRiJhZaZuIGAL8GHg78J8RMTwiekbEHyLi/vKPt4tJkqRur8MhxYg4D2hIKZ0WETdHxHEppWc7agOMAD6fUnq0HL7eBrwE/CCltH8bEuXtnOtZ+OOii6ieQ668sugSasas02YVXYIkqQ5VMoerEZhffnw38C7g2Y7apJTmAkTEBEq9XLOBjwHnRMTpwFLgopTS1v2oPx/jL2RxNwpc1dg5vruY8uYpRZcgSapDlQwpDgBadplcCxxcaZuICOB8YB2wBXgMeF9K6e1AL+DstieKiBkRsSgiFrn+iCRJ6g4qCVwbgX7lxwPbeU9mm1Tyt8CTwAeBJ1NKq8rtFgHHtT1RSmlOSml8Smn88OHDK/4iVbVoLuOYW8xn52DdrfNZd+v8jhvWgeZnmml+prnoMiRJdaaSIcXFlIYRHwXGAr+ppE1E/COwKqX0XeAAYD3wfyPiauAp4G+AL+/3N8jDjy9hIrCYC3cc6spLPqy+4grAoUWA2Y/MBhxalCR1rkoC1+3AgxExAjgLmBYRV6WUZu6hzamUernmR8SnKQWsuykNO34fCOCOlNLPqvdVJEmSalOHgSul9HJENAJnANemlFYDT3TQZkP5pTPanO4p4KT9LVqSJKkrqWil+ZTSOnbehbjPbWpF2+HBrjxcKEmSap97KUqSJOXMwCVJkpSzuti82iFDSZJUpLoIXHutaUO3Cmmjfr286BJqxtLpS4suQZJUhxxSlCRJypmBS5IkKWcGriw3TmAGE4quompWnDeJFedNKrqMmjB14VSmLnTFfUlS53IOV5ZVTzCi6BqqaPPTTxddQs1Yvtb5bJKkzmcPlyRJUs4MXJIkSTkzcEmSJOXMwCVJkpQzA5ckSVLOvEsxy9ums/i/ii6ieg6YMqXoEmrGpONcHkOS1PkMXFk++DUWdqPAdeg/zS66hJrR9I6mokuQJNUhhxQlSZJyZg8X7LZR9aEs4VBgFScXUU7VbXpqGQD93jK64EqKt2xN6VqMHua1kCR1HgNXhotoBKCJDcUWUiUrJ08GYNSvXWV92o+nAbB0+tKCK5Ek1ROHFCVJknJm4JIkScqZgUuSJClnBi5JkqScGbgkSZJyZuCSJEnKmctCZLiR+4suoaqOuu22okuoGbecc0vRJUiS6pCBK0N3WfC0hQue7uSCp5KkIhi4qqjtivVtn0uSpPrkHK4ME7mYiVxcdBlVs+pLs1j1pVlFl1ETmh5uounhpqLLkCTVGQNXhnHMYxzzii6jatY3N7O+ubnoMmrCgmcXsODZBUWXIUmqMwYuSZKknBm4JEmScmbgkiRJypmBS5IkKWcGLkmSpJy5DleGFxlbdAlV1ffEE4suoWaMGjqq6BIkSXXIwJVhDg8UXUJVHf1Dl0FoMX/i/KJLkCTVIYcUJUmScmbgkiRJypmBK0MTg2licNFlVM3yE0ax/ATnLgGMmTeGMfPGFF2GJKnOGLgkSZJyZuCSJEnKmYFLkiQpZwYuSZKknBm4JEmScmbgkiRJylmuK81HxFBgHLAkpfTnPD+rmhZyfdElVNUhV15ZdAk1Y9Zps4ouQZJUhyoKXBFxE3Ai8JOU0lWVtImIIcCPgZ8A10XEe1NKL1VyrqIt5sKiS6iqIedPLbqEmjHlzVOKLkGSVIc6HFKMiPOAhpTSacDIiDiuwjYnAZ9PKV0N3AW8rZJzSZIkdTeVzOFqBFp2/L0beFclbVJKP08pPRoRE4C3A49Ucq6ImBERiyJi0UsvvVTh16iuccxlHHML+ew8rLt1PutuddNmgOZnmml+prnoMiRJdaaSIcUBwAvlx2uBt1XaJiIC
OB9YB2yp5FwppTnAHIDx48enSr5EtU3kEmD/hxabmvb8vLOsvuIKwKFFgNmPzAYcWpQkda5Kerg2Av3Kjwe2857MNqnkb4EngQ9WeC5JkqRupZLAs5idQ39jgZWVtImIf4yIT5SPHQCsr/BckiRJ3UolQ4q3Aw9GxAjgLGBaRFyVUpq5hzanUgpz8yPi08BTlOZsDcpoJ0mS1K11GLhSSi9HRCNwBnBtSmk18EQHbTaUXzqjzenaaydJktRtVbQOV0ppHTvvLtznNnvTTpIkqbtw0rokSVLOct3ap6tqonuNdI769fKiS6gZS6cvLboESVIdsodLkiQpZwYuSZKknBm4MsxgAjOYUHQZVbPivEmsOG9S0WXUhKkLpzJ1oSvuS5I6l3O4MozYddWLLm/z008XXULNWL7W+WySpM5nD5ckSVLODFySJEk5M3BJkiTlzMAlSZKUMyfNd6KmpsqOSZKk7sXAlWEx04suoaoOmDKl6BJqxqTjXB5DktT5DFwZFvK1okuoqkP/aXbRJdSMpnc0FV2CJKkOOYdLkiQpZwauDIeyhENZUnQZVbPpqWVsempZ0WXUhGVrlrFsjddCktS5HFLMcBGNADSxodhCqmTl5MkAjPq1q6xP+/E0AJZOX1pwJZKkemIPlyRJUs4MXJIkSTkzcEmSJOXMwCVJkpQzA5ckSVLODFySJEk5c1mIDDdyf9ElVNVRt91WdAk145Zzbim6BElSHTJwZVjFyUWXUFX93jK66BJqxuhhXgtJUudzSFGSJClnBq4ME7mYiVxcdBlVs+pLs1j1pVlFl1ETmh5uounhpqLLkCTVGQNXhnHMYxzzii6jatY3N7O+ubnoMmrCgmcXsODZBUWXIUmqMwYuSZKknDlpvmBNTXt+LkmSuj57uCRJknJm4JIkScqZgUuSJClnzuHK8CJjiy6hqvqeeGLRJdSMUUNHFV2CJKkOGbgyzOGBokuoqqN/6DIILeZPnF90CZKkOuSQoiRJUs4MXJIkSTkzcGVoYjBNDC66jKpZfsIolp/g3CWAMfPGMGbemKLLkCTVGQOXJElSzgxckiRJOTNwSZIk5czAJUmSlDMDlyRJUs4MXJIkSTlzpfkMC7m+6BKq6pArryy6hJox65QMHSgAACAASURBVLRZRZcgSapDBq4Mi7mw6BKqasj5U4suoWZMefOUokuQJNWhioYUI+KmiHgkImZW2iYiBkfEnRFxd0T8e0T0joieEfGHiLi//OMKlJIkqdvrMHBFxHlAQ0rpNGBkRBxXYZuPAtellM4EVgMfAE4CfpBSaiz/LK3ml6mWccxlHHOLLqNq1t06n3W3umkzQPMzzTQ/01x0GZKkOlPJkGIj0PK39d3Au4BnO2qTUvpGq9eHA38CTgXOiYjTgaXARSmlra1PFBEzgBkARxxxRKXfo6omcgnQfYYWV19xBeDQIsDsR2YDDi1KkjpXJUOKA4AXyo/XAgfvTZuIOA0YklJ6FHgMeF9K6e1AL+DstidKKc1JKY1PKY0fPnx4xV9EkiSpVlXSw7UR6Fd+PJDskJbZJiKGAl8HJpVfezKl9Hr58SJgt+FJSZKk7qaSwLWY0jDio8BY4DeVtImI3kAzcHlK6ffldv83Iq4GngL+Bvjy/pW/u6amap9RkiRp/1QSuG4HHoyIEcBZwLSIuCqlNHMPbU4FPgW8DfhiRHwRuAGYDXwfCOCOlNLPqvdVuoe2gdEAKUlS19dh4EopvRwRjcAZwLUppdXAEx202UApYN2QccqT9rdoSZKkrqSihU9TSuvYeRfiPreRJEmqR640n6GJDUWXUFWjfr286BJqxtLpNbn0mySpm3PzakmSpJwZuCRJknJm4MowgwnMYELRZVTNivMmseK8SR03rANTF05l6kJX3JckdS7ncGUYsetNmF3e5qefLrqEmrF8rfPZJEmdz8BV47LW4XJtLkmSuhaHFCVJknJm4JIkScqZgUuSJClnBi5JkqScOWk+w2Km
F11CVR0wZUrRJdSMSce5PIYkqfMZuDIs5GtFl1BVh/7T7KJLqBlN72gqugRJUh1ySFGSJClnBq4Mh7KEQ1lSdBlVs+mpZWx6alnRZdSEZWuWsWyN10KS1LkcUsxwEY0ANLGh2EKqZOXkyQCM+rWrrE/78TQAlk5fWnAlkqR6Yg+XJElSzgxckiRJOTNwSZIk5cw5XF1Q282r3cxakqTaZg+XJElSzgxckiRJOXNIMcON3F90CVV11G23FV1CzbjlnFuKLkGSVIcMXBlWcXLRJVRVv7eMLrqEmjF6mNdCktT5HFKUJEnKmYErw0QuZiIXF11G1az60ixWfWlW0WXUhKaHm2h6uKnoMiRJdcbAlWEc8xjHvKLLqJr1zc2sb24uuoyasODZBSx4dkHRZUiS6oyBS5IkKWcGLkmSpJwZuCRJknJm4JIkScqZgUuSJClnLnya4UXGFl1CVfU98cSiS6gZo4aOKroESVIdMnBlmMMDRZdQVUf/0GUQWsyfOL/oEiRJdcghRUmSpJwZuCRJknJm4MrQxGCaGFx0GVWz/IRRLD/BuUsAY+aNYcy8MUWXIUmqMwYuSZKknBm4JEmScmbgkiRJypmBS5IkKWcGLkmSpJwZuCRJknLW5Veab2qq/jkXcn31T5qjtteg7fNDrryys0qpebNOm1V0CZKkOtTlA1ceFnNh0SVU1ZDzpxZdQs2Y8uYpRZcgSapDDilKkiTlrKLAFRE3RcQjETGz0jYRMTgi7oyIuyPi3yOid6XnKto45jKOuUWXUTXrbp3PulvdtBmg+Zlmmp9pLroMSVKd6TBwRcR5QENK6TRgZEQcV2GbjwLXpZTOBFYDH6jkXLVgIpcwkUuKLqNqVl9xBauvuKLoMmrC7EdmM/uR2UWXIUmqM5X0cDUCLd0jdwPvqqRNSukbKaV7yseGA3+q8FySJEndSiWBawDwQvnxWuDgvWkTEacBQ1JKj1ZyroiYERGLImLRSy+9VNGXkCRJqmWVBK6NQL/y44HtvCezTUQMBb4OfLLSc6WU5qSUxqeUxg8fPryS7yBJklTTKglci9k59DcWWFlJm/Ik+Wbg8pTS7/fiXJIkSd1KJetw3Q48GBEjgLOAaRFxVUpp5h7anAp8Cngb8MWI+CJwQzvtJEmSurUOA1dK6eWIaATOAK5NKa0GnuigzQZKAeuGtufLaCdJktStVbTSfEppHTvvLtznNnvTrkhNdK8cOOrXy4suoWYsnb606BIkSXXIleYlSZJyZuCSJEnKmYErwwwmMIMJRZdRNSvOm8SK8yYVXUZNmLpwKlMXupm3JKlzVTSHq96M2PWegC6nqWnX5+c//XQhddSi5WudzyZJ6nz2cEmSJOXMwCVJkpQzA5ckSVLODFySJEk5M3BJkiTlzLsUMyxmetElVNUBU6YUXULNmHScy2NIkjqfgSvDQr5WdAlVdeg/zS66hJrR9I6mokuQJNUhhxQlSZJyZg9XhkNZAsAqTi64kur4l4uXAbBu6Ogdx9oujlovlq0pXYvRw0Z30FKSpOoxcGW4iEYAmthQbCFVcubdkwG4dZqrrE/78TQAlk5fWnAlkqR64pCiJElSzgxckiRJOTNwSZIk5czAJUmSlDMDlyRJUs4MXJIkSTlzWYgMN3J/0SVU1d1n3lZ0CTXjlnNuKboESVIdMnBl6C4LnrZoveBpi7YLn9bLQqgueCpJKoJDipIkSTkzcGWYyMVM5OKiy6ia8b+axfhfzSq6jJrQ9HATTQ83FV2GJKnOGLgyjGMe45hXdBlVc8xzzRzzXHPRZdSEBc8uYMGzC4ouQ5JUZwxckiRJOTNwSZIk5czAJUmSlDMDlyRJUs4MXJIkSTlz4dMMLzK26BKqau2QE4suoWaMGjqq6BIkSXXIwJVhDg8UXUJV3fN+l0FoMX/i/KJLkCTVIYcUJUmScmbgkiRJypmBK0MTg2licNFlVM35t4zi/FucuwQwZt4YxswbU3QZkqQ6Y+CSJEnK
mZPmBUBT056fS5KkfWcPlyRJUs4MXJIkSTkzcEmSJOXMwCVJkpQzJ81nWMj1RZdQVY+Nv7LoEmrGrNNmFV2CJKkOGbgyLObCokuoqueOnVp0CTVjypunFF2CJKkOOaQoSZKUM3u4MoxjLtB9erpG/ra0YfPe9HRlrcPVHdbman6mGbCnS5LUuQxcGSZyCdB9AtdfLroCcGgRYPYjswEDlySpc1U0pBgRN0XEIxExc2/aRMTBEfFgq+eHRcTzEXF/+Wf4/pUvSZJU+zoMXBFxHtCQUjoNGBkRx1XSJiKGAPOAAa2angJcnVJqLP+8VJ2vIUmSVLsq6eFqBOaXH98NvKvCNtuA84GXW7U7Ffh0RPxXRHw568MiYkZELIqIRS+9ZB6TJEldXyWBawDwQvnxWuDgStqklF5OKW1o0+5OSuHsL4HTIuKktidKKc1JKY1PKY0fPtwRR0mS1PVVErg2Av3Kjwe2855K2gA8nFJ6JaW0DVgC7DY8KUmS1N1UErgWs3MYcSywch/bANwVEYdGRH/gTOCpiiuVJEnqoiKltOcGEW8CHgTuBc4CpgFTUkoz99Dm1JbhxIi4P6XUWH58OnAD8AYwJ6X0r3v67PHjx6dFixbtsb7usDZUV+W1lyRpp4hYnFIan/Vah+twpZRejohG4Azg2pTSauCJDtpsaPVaY6vH/wmcsA/fQZIkqcuqaOHTlNI6dt6FuM9tJEmS6pF7KWaYwQRmMKHoMqrmjLsmccZdk4ouoyZMXTiVqQtdcV+S1Lnc2ifDiF1HTLu8oeueLrqEmrF87fKiS5Ak1SF7uCRJknJm4JIkScqZQ4raZ22XhXCZCEmSstnDJUmSlDMDlyRJUs4cUsywmOlFl1BVvxs5pegSasak41weQ5LU+QxcGRbytaJLqKpFb59ddAk1o+kdTUWXIEmqQw4pSpIk5czAleFQlnAoS4ouo2qGrF3GkLXLii6jJixbs4xla7wWkqTO5ZBihotoBKCJDXtu2EWcefdkAG6d5irr0348DYCl05cWXIkkqZ7YwyVJkpQzA5ckSVLODFySJEk5cw6XqiZrax+3+5EkyR4uSZKk3Bm4JEmScuaQYoYbub/oEqrq7jNvK7qEmnHLObcUXYIkqQ4ZuDKs4uSiS6iqdUNHF11CzRg9zGshSep8DilKkiTlzB6uDBO5GOg+m1iP/9UsoJhNrNvepVj0XYtND5cKcBNrSVJnsocrwzjmMY55RZdRNcc818wxzzUXXUZNWPDsAhY8u6DoMiRJdcbAJUmSlDMDlyRJUs4MXJIkSTkzcEmSJOXMwCVJkpQzl4XI8CJjiy6hqtYOObHoEmrGqKGjii5BklSHDFwZ5vBA0SVU1T3vdxmEFvMnzi+6BElSHXJIUZIkKWcGLkmSpJwZuDI0MZgmBhddRtWcf8sozr/FuUsAY+aNYcy8MUWXIUmqMwYuSZKknBm4JEmScmbgkiRJypnLQqhTNTXt+bkkSd2RPVySJEk5s4dLhcrq4bLXS5LU3Ri4Mizk+qJLqKrHxl9ZdAk1Y9Zps4ouQZJUhwxcGRZzYdElVNVzx04tuoSaMeXNU4ouQZJUh5zDJUmSlDN7uDKMYy7QfXq6Rv62tGFzV+npyvNOxuZnmgF7uiRJncvAlWEilwDdJ3D95aIrgK4TuPI0+5HZgIFLktS5HFKUJEnKWUWBKyJuiohHImLm3rSJiIMj4sFWz3tFxMKIeCgiPrl/pUuSJHUNHQauiDgPaEgpnQaMjIjjKmkTEUOAecCAVk0/ByxOKb0TmBwRg6ryLSRJkmpYJXO4GoH55cd3A+8Cnq2gzQLgfOBHbdpdVn78ADAe+M+9K1n1xu2AJEldXSVDigOAF8qP1wIHV9ImpfRySmnD3p4rImZExKKIWPTSSy9VUJ4kSVJtqyRwbQT6lR8PbOc9lbSpqF1KaU5KaXxKafzw4cMrKE+SJKm2VTKkuJjSEOGjwFjgN/vYpnW728rtHt3LejtFE207
5rq2W6ctL7qEqtqf/ReXTl9azVIkSapIJYHrduDBiBgBnAVMi4irUkoz99Dm1HbONQ/4j4h4N3Ai8Mt9L12SJKlr6HBIMaX0MqXJ7o8Cp6eUnmgTtrLabGj1WmOrx78HzgAeAt6XUtq2/19BkiSptlW00nxKaR0770Lc5zbldi9W0q5IM5gAwBweKLiS6jjjrkkA3PP+BQVXUrypC0ur7c+fWNP/CkqSuhm39skwgieKLqGqhq57uugSasbytd1rPpskqWtwax9JkqScGbgkSZJyZuCSJEnKmYFLkiQpZ06aV7fgfouSpFpm4MqwmOlFl1BVvxs5pegSasak4yYVXYIkqQ4ZuDIs5GtFl1BVi94+u+gSakbTO5qKLkGSVIecwyVJkpQzA1eGQ1nCoSwpuoyqGbJ2GUPWLiu6jJqwbM0ylq3xWkiSOpdDihkuohGAJjbsuWEXcebdkwG4dZqrrE/78TQAlk5fWnAlkqR6YuBSt+Rdi5KkWuKQoiRJUs4MXJIkSTkzcKkuNDXtOqzoEKMkqTMZuCRJknJm4JIkScqZdylmuJH7iy6hqu4+87aiS6gZ733hlqJLkCTVIQNXhlWcXHQJVbVu6OiiS6gZQ97YeS1cOkKS1FkcUpQkScqZgSvDRC5mIhcXXUbVjP/VLMb/albRZdSExcOaWDysqegyJEl1xiHFDOOYB8BCvlZwJdVxzHPNACx6++yCKyneyjctAGDcmqbdXnOIUZKUF3u4JEmScmbgkiRJypmBS5IkKWcGLkmSpJw5aV5qR9akeSfSS5L2hYErw4uMLbqEqlo75MSiS6gZB7w+ar/e752MkqR9YeDKMIcHii6hqu55/4KiS6gZf/Xi/KJLkCTVIedwSZIk5czAJUmSlDMDV4YmBtPE4KLLqJrzbxnF+bfs39yl7mLB0WNYcPSYosuQJNUZA5ckSVLOnDQvVZFLSUiSshi4pP1gmJIkVcIhRUmSpJwZuCRJknJm4JIkScqZc7gyLOT6okuoqsfGX1l0CTXj5D/P6vTPdDsgSZKBK8NiLiy6hKp67tipRZdQM0a+MqXoEiRJdcjAJXUye7wkqf44hyvDOOYyjrlFl1E1I387n5G/ddNmgOcGNfPcoOaiy5Ak1Rl7uDJM5BKg+wwt/uWiKwCHFgGWHDgbcGhRktS57OGSJEnKmT1cUsEqmcPlPC9J6toq6uGKiJsi4pGImLk3bdoei4ieEfGHiLi//DNm/7+CJElSbeswcEXEeUBDSuk0YGREHFdJm3bedxLwg5RSY/lnaXW/jiRJUu2pZEixEWi5xe1u4F3AsxW0OTnjWD/gnIg4HVgKXJRS2rqPtUt1w6UkJKlrqyRwDQBeKD9eC7ytwjZZx+4F3pdSWhUR3wXOBu5ofaKImAHMADjiiCMq/iJSPXHelyR1LZUEro2UeqYABpI9DJnVJuvYkyml18vHFgG7DU+mlOYAcwDGjx+fKqiv6prYUMTH5ubWacuLLqFmTFrhKLYkqfNVErgWUxoOfBQYC/ymwjbPZxz7vxFxNfAU8DfAl/e2YP+rXZIkdTWVBK7bgQcjYgRwFjAtIq5KKc3cQ5tTgZRx7Eng+0AAd6SUfla9ryKptUrmfTk3TJI6R4eBK6X0ckQ0AmcA16aUVgNPdNBmA0DGsQ2U7lSsaTOYAMAcHii4kuo4465JANzz/gUFV1K8e0eUVtv/qxfd6kiS1HkqWvg0pbSOnXccVtymkvfVohG75skub+i6p4suoWas71O/89nsvZKk4rjSvKQdHGKUpHy4l6IkSVLODFySJEk5c0hR0n5xGFKSOmYPlyRJUs7s4cqwmOlFl1BVvxs5pegSasZRL08quoQupZK1uyRJHTNwZVjI14ouoaoWvX120SXUjHFrmoouQZJUhwxckvbKvvRwOc9LUr0zcGU4lCUArOLkgiupjiFrlwGwbujogisp3rrepWsx5A2vRV4MU5K0OwNXhotoBKCJDcUWUiVn3j0ZgFun
1e8q6y3uO2waAJNWLC24EklSPTFwSep0DjFKqjcGLkk1x7sjJXU3Bi5JhavGRPxqnVeS8mDgktQleHekpK7MleYlSZJyZg+XpLrhMKSkohi4MtzI/UWXUFV3n3lb0SXUjPe+cEvRJUiS6pCBK0N3WfC0hQue7uSCp9pblcwDc66YpI4YuCSpygxgktoycGWYyMVA99nEevyvZgFuYg2weFgT4CbWal9H4cjwJGlfeJdihnHMYxzzii6jao55rpljnmsuuoyasPJNC1j5pgVFlyFJqjP2cElSARx2lOqLgUuScpbXchSGNKnrMHBJUjdmT5pUGwxcktRFGaakvbd123Y2bdnG5i3b2bxlW/nxNja9sfPx5i2lNi3HXi+329Tqtc3l17ZuSxx6QF+OPnDAHj/XwCVJdcShS9WilBJvbNvO5je2s3nrzqCzMwBtY9Mb29s831Zu2+Z4y2utg1Or17dsS/tUY79eDfTr3UDfnj3o27uh9LxXAz16BItWruOOJ17c4/sNXBleZGzRJVTV2iEnFl1CzTjg9VFFlyDlprOC0b7MSatkwVjVnu3bE69v3TW0ZIaZHeFnZ1B6fcv2XXqNdv29fcd5Wo5v34cc1NAj6Nergb69GujXuwd9e5ZDUa8GBvfvzSG9euwISn3Kr7UEpb69epTf17DL+1pea2nbt1cDfXr2ICL2WMvmLdvod037rxu4MszhgaJLqKp73u8yCC3+6sX5RZcgdTl5BaOOQtm+rOpfLyFu67btbN66vVX42bbLENhuw2WthsDaD0rbdxk62/TGNl7fun2f6uvdswd9e+4aWlrCzEGDeu0MO70aWgWmlnY9dh5vFYZ2CUrl570aamd1q769Gvb4uoFLklR1eQ1dVuO8eYWyHcNiLWEno3dnc5ten51BqTSUtrnNUNqmLdtbBaWdv6s5LNa3VwP9e/dk6ICdr+0elHpw950N9IwGGmigJw3M+FRLGOqxS6Dq26uBhh577g2qRwYuSaoRKe38S7Tto9bP2/5V2/ZI2u3dlbTMenUPNbQpYveqdh5r/zy7H+uohtbvyGq5aGXpUUt9KSVWb6dVvYmHf7vzHdtTYu53t7OVbWxL29jKNt57RquJ0bvMFdoZeJ77fek9W9M2trGNht6l16o+LNavF4e8qU/msFhLT1A1h8X2ZOU9uz7/y6P2+VR1ycCVoYnB5d8bCq6kOs6/pTRv6dZpywuupHgLjh4DwKQVSwuuRJXamraxMW3i1fLPxvLPq2kTm3i9Tbho9XiXv/g6Cho7/7f951nnqV7YUXX8xzc7bnPXt/f8+i/vLP3uQQ960oOeNNAQpV6dBkq9PP2iFw30oGeUe322NdCzR+seoB5MOS97WKwlKF3/v0vn7BHlYbGt5Z9WqjFfLq/5c/U6nLuvDFySCpNS4nW2lAPUa2xMm8uh6jVeTZvZmDbxOm/s8p4g6E9fBkY/hsab6EG0eXXHw9LzaP3qzv/NetTOmTp8XnnLPX1m+XmrL9C2L2J/amjdpqMaWr/SUQ2lmne/fu1+vz2dZw/v2PtrsYdvtcu/E7EjILUeLmuggR770RsE8OQde369dwWnLyrEFHkDRlcKbntTq4FLUm62p+28xuYdPVJtf7+aNrOVbbu8pycNDIh+DIx+DOsxmIHRjwHRl4HRnwHRj/702dkjIGk3ec2F2xf70gtWjRsjqnUn7b6ctz0GLkn7bEvautswX+tA9Rqbdxs260tvBkQ/DohBHNbjoHKg6rfjdx967dc8E0nFqOWgl9d79oaBS1KmlBKbeWO3QNU6WL3Oll3eEwQDoi8D6MchPYbtEqRafveMPd86LUmtdaUhxj0xcEl1alvazmvleVKvkh2otrHrGjw9aWBgOTwN73HAboGqH333e96LJHVHBi6pm3ojbcmYN7V5x4T019i823v60YcB5cnof9HjYAZGXwZE/x1zqHrT0+E+SdoHBq4MC7m+6BKq6rHxVxZdQs04+c+zii6h
KlJKbOL1zHlTLb/faHN/eQ+CAeXeqBE9DmQAu/ZO9Y++DvdJUk4MXBkWc2HRJVTVc8dOLbqEmjHylSlFl1CRbWnbjmURdoQodu2p2t5muK8XPXcEqIN7DN3xeOdwXx97pySpIAYuqZOllHiDrZm9Uq0X9Gyrf3m4b1gM5sgeh+w2Gb139Crg20iSKmHgyjCOuUD36eka+dvShs32dMFzg5qBfHu6tpeH+0oLee46b6olUG3Zbbivx47gdHiP4QyI/qXnlOZO9Y++NLj2lCR1WQauDBO5BOg+gesvF10BGLgAlhw4G9i/wLU1bcueN1W+0++1tJntbVaf6kMvBkQ/BkV/Ds1YLqEvvR3uk6RuzMAltdKy1Uzb5RFaB6vNu201A/3py4Dox0ExhAE9+u0WqHqF/1eTpHrm3wKqS3/cvjZz3amNadNuW800tBruO6LHm3YJUwMobTvjVjOSpD0xcKlbadlqZud8qdYbIm+iJRbd+cYjO97Th94MjH68KQYwosfw8ppT/crzqPrSx+E+SdJ+qunAte61N/jhfz1PQ4+gZ48eNPQI/ntbEAQ9KP+OHrs+3/G7fDxaH++xWzv/Iu06SsN9b+zSM7Vr79RmXt9tuC/oTylAHdxjKC+Vj5/R6+3lJRP6OtwnScpdRX/TRMRNwInAT1JKV1XaptJj7Xl+3SY+P/+JSr/LPgnYLYg19S291rz5vjaBLSO4RXaQ29Guove3HwjbP39WoGx1vNX7W7yRtuzyWQE1FTi3p+28mja3WW9q12DV3lYzA6Ifw3ocsGPbmdZrT7Ue7ltQ/n1Yw/BO/GaSpHrXYeCKiPOAhpTSaRFxc0Qcl1J6tqM2wJhKjrU9V2vHHzKIhf/QyNbtiW3bE1u3Jb7xzURiO9tJJBLbU/l3+ciO4zteb+d4S/u0+/EWhzYMyzx/y6PtKbEtbWc721qdI6u2NsfLv1ObO9ny8sny7++/fvdurxUZBreTeK1VoNrE5t2uSN/ycN+QGMRf9Dhot8novelVU6FRkqQslfRwNQLzy4/vBt4FtA1JWW1OrvBY2/A2A5hRfvr6UQcOfKqCGqtq51/fb+3sj87FiS0P/tc5rQ8fCPy504upEU+RW0ir6+uaM69tPryu+fHa5qOWr+uR7b1QSeAaALxQfrwWeFuFbSo9touU0hxgDkBELEopja+gRu0lr20+vK758drmw+uaH69tPrrqda3kXvaNQL/y44HtvCerTaXHJEmSurVKAs9iSkN/AGOBlRW2qfSYJElSt1bJkOLtwIMRMQI4C5gWEVellGbuoc2pQKrw2J7M2atvo73htc2H1zU/Xtt8eF3z47XNR5e8rpFSx3fKRcQQ4AzggZTS6krbVHpMkiSpO6socEmSJGnfOWldkiQpZzUbuCLipoh4JCJmdtxaexIRgyPizoi4OyL+PSJ6e32rJyIOjogl5cde1yqKiG9ExMTyY69tFUTEkIj4j4hYFBE3lo95bfdD+c+AB8uPe0XEwoh4KCI+2d4xVabNtT0iIu6PiPsiYk6UdJlrW5OBq/XK9cDI8ir12ncfBa5LKZ0JrAam4fWtpq8C/fz3troi4t3AISmlhV7bqvo48G/ldYwGRcSleG33WXle8jxK60wCfA5YnFJ6JzA5Iga1c0wdyLi2FwGfTSm9F/gLSrvXdJlrW5OBi+yV67WPUkrfSCndU346HPgYXt+qiIj3Aq9SCrKNeF2rIiJ6Ad8CVkbEuXhtq2kN8JaIOIDSX1pH47XdH9uA84GXy88b2Xk9HwDGt3NMHdvl2qaUvphSWl5+bRil1eYb6SLXtlYDV9sV6Q8usJZuIyJOA4YA/43Xd79FRG/gS8Bl5UP+e1s9nwCeBq4F3g78LV7bavkFpe1HLgaWA73x2u6zlNLLKaUNrQ5l/Tngnw37IOPaAhAR5wPLUkov0oWuba0GLlekr7KIGAp8ndJe1l7f6rgM+EZKaX35ude1ek4G5pSXjvkepf9y9dpWxxXA
Z1JKs4FfAx/Ba1tN7rKSo4gYCfw9cEn5UJe5trVamCvSV1G5J6YZuDyl9Hu8vtXyPuBvI+J+Sjud6clf4AAABFpJREFUT8TrWi2/BUaWH48HjsJrWy1DgDER0QCcAlyD17aa3GUlJ+U5XT8APtmq56vLXNtKVpovQtbK9dp3n6K0UfgXI+KLwFzg417f/ZNSmtDyuBy6Poj/3lbLTcDNETEN6EVpnsYdXtuq+AqlPwOOBB4B/hn/va2mecB/lG/6OBH4JaUhr7bHtPcuA44Avh4RUOqtzbreNalmFz51Rfp8eX3z4XXNj9c2P17b6iqH13cBd7X0xGQdU3V0lWtbs4FLkiSpu6jVOVySpP+/vfsJlaoM4zj+/YmZi/5wqcCoKKnoVmq1izRIjWjRQiRFjDZtEkkXYnDFqLiULqobCVGL/kBR0O3PQoJ7i8KIihKSQooQjAILirBLQZn36q/F+w4cJu0KdaAz/T4wMDNnzjtzZjE887wPzxMRAyMBV0RERETLEnBFRGdIuqaO85hTm6M2j82T9JffNEnzJZ0xy7pD//ZnjYhoSsAVEV2yBngQWAJM1vmgU5LeBiaBYUk7a6+enrXAS7Os+7KkE3aoljQqabmkhyWNSDpT0lu1rUJExClJ0XxEdEbNYD1ACbrOs/2jpI9tXy9pMaWR58WUAGsNZarCZH3+VdsfNNb5Cdh3kre6jtKxej6wgdKNfSHlT+rjwKjt1b2Mmu3jLVxuRAyQBFwR0QmSTrM9Xe9fALxge2Uj4DoArLB9SNKQ7Z8lbaF0on4R2E3psH6wBkrPUEYGHXHfD6GkXcAWStB1T719BnwEnA3cAEwBlwGrbO9t/xuIiC7LlmJEdMUOSfsl7bT9HXBE0jCApKWUbNXtkj4HNkm6A1gNfGn7d0qmalzSCtvHbd8FPALskfRevU1J2m17s+0ZyvDcS4DHgEPAAkp39u3A3cB4gq2IOBXJcEVEZ0haRmnQOQoMA98C7wDrKPPUDgA3UjrT7wd+BZYCT1A6Uj8JfGX7m751LwXuBz4EnqvBFpIupMxnu5MyAWMa2AHspXRpv8L2rrauNyIGx391tE9ExMkY2AisqvevpGwPQhly/T6A7ddrgHYrpch+zPYegFogPwb0aq/OBc6i1H+tV5kb8ihwGHgIuJwyp+1aSi3XG/W8idauMiIGSgKuiOiacyjZ+ZUAtYbr5t5BSTcB8yQ9C7wCfGJ7c98anwLLbR+r59wGLLM9Uh/PBebYPippnDJfcAL4wva0pH2UgG+0zQuNiMGRGq6I6JJFwC3A+N+8ZgjYRNny+w34pXdA0jZJ57s41jhHlGwZALZnbB+tD58G3gXuAxZJWghcDfxBGQofETGrZLgiokvepBSqH24819/U9Htgre0JSRcB6+rWYq9R6ljzxZI2APcCI/1vVhuiPgV8TclyXQU8D2wFfgBek7Te9sF/fGURMdBSNB8R/2uSTgdm+jJezeNzG0X0omw19rYi1d9SIiLiRBJwRURERLQsNVwRERERLUvAFREREdGyBFwRERERLUvAFREREdGyBFwRERERLfsTUCuvvCTrZfgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "fig, ax = plt.subplots(figsize=(10,8))\n",
    "sn.distplot(comments['comment_length'], bins=comments['comment_length'].max(),\n",
    "            hist_kws={\"alpha\": 0.5, \"color\": \"blue\"}, ax=ax)\n",
    "ax.set_xlim(left=0, right=np.percentile(comments['comment_length'], 95))\n",
    "ax.set_xlabel('评论字数')\n",
    "ymax = 0.04\n",
    "plt.ylim(0, ymax)\n",
    "ax.plot([mode, mode], [0, ymax], '--', label=f'mode = {mode:.2f}', linewidth=2)\n",
    "ax.plot([mean, mean], [0, ymax], '--', label=f'mean = {mean:.2f}', linewidth=2)\n",
    "ax.plot([median, median], [0, ymax], '--',\n",
    "        label=f'median = {median:.2f}', linewidth=2)\n",
    "ax.set_title('影评的字数分布', fontsize=16)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.percentile(comments['comment_length'], 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 248396 entries, 0 to 261496\n",
      "Data columns (total 4 columns):\n",
      "comment            248396 non-null object\n",
      "star               248396 non-null int64\n",
      "cleaned_comment    248396 non-null object\n",
      "comment_length     248396 non-null int64\n",
      "dtypes: int64(2), object(2)\n",
      "memory usage: 9.5+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "comments = comments[comments['comment_length']>3]\n",
    "print(comments.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**字数最少（4个字）的评论的的评分分布**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8947"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(comments[comments['comment_length']==4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yangbin7/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  \n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEGCAYAAABvtY4XAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAZb0lEQVR4nO3dfZBU9Z3v8feHJ9EhI4hcJoMYNGsM2fAQnXiZDUkIqxYIUcvc1bWyYVMJl5WI1xR31xBJMIJ5qNQVdFFg3VhRCWoEFc0iPhB11IDRYXyA1UupC5OgECEICJeAId/7xznAPDI9zXT3eObzquqyz69/3ed7xplP//p3fqdRRGBmZtnTrdQFmJlZYTjgzcwyygFvZpZRDngzs4xywJuZZZQD3uwYSVILbT1KUYtZQw546/QknSypW3p/iKRPpPd7NA1SJY5r5XU+mf73dEljW+kzTVJvSU9KGibpXySVS/o3SV9opcRRkp5o0lYj6aw2jqvNepr07yapRtLH2uprBg546wQkzZP0n5KeTm8vS/r39LF+wNPA5Wn304FHJZ0KfBOolbRW0nuSNgBrgbWSPtJkHxOAZeloO4DbJJ3QQjk9gO8BfwbKgL+PiN3AWGBzK4cwDPhNg331BU4GXjrKMedaT0PjgRMior6NfmZA8stsVmp/IRlsHPp97A4ckPRXwALg6oj4NUBEPCnp+8BngX8HfhkROyUtB24CaoBhEfH+oRdPR//XATMjubJvo6RH0v5TGvTrDbxDErj9gDHAM+nIvU9E/Ffa77iI2J/e/1Vayz5JFwM3AOVAX+CNdPamIiL65FFPNXAPsIfkDecTwBZJLzf42Sn9uV0ZEU/n/iO3rkC+ktVKTdJo4AygKm16Efg90AuoA3YC3weuB/6chiJpoE6KiEsaBPwHwL3AJyJiX9rvGuC8iDivwT77AM8DjwH/HBEhqRz4EXAOSWgvBnYAA0lG8L8HTgXeB0ZGxPuSXgP+Jn2TuQF4C/g6MCUiNqT7eisiPt5g3znV0+RndDnwjYg4T9LPgP8VEf8vjx+3dSGeorGSkvQtYC7wj8DfAueRBOQPgc9ExB+AecAfSAL+/0r6IA3Eh4Dhh+ayU1cCP2oQ7n8LfJsGI2OAiNhDEtrnkozSzwT2Af8FvA08S/LG8h7wceCWiKgCniSZtjn0CeEvTQ7pi8nLJ+HetE876zn0nCHAnAbP+RLJpwyzo/IUjZVURCyQ9DpwdpOH1kfEo5KuAnpFxPy0/VpJm4D9JAOUfwS2NXjefwAPSOoOnADcRjL6XyPpeJIpjd1p35NIwvazwEHgFGA7cA0wm2Qa5WKgZ4P6TiUZpbfmBeBnkm6PiG+mbX8BSM8LtKceJH0UWAF8DHgwnfI5BXheUqT1fC0iVhylJuuiPEVjJSfpRuA4oDZt+iTwEZLpkeHA/yAJv3fTqZRNwF8BjwADSMLw48C7JNMnvYClETFHUq+IOJDu56fA2xFxc7r9KMlo/5l0+2zgZyTz591IpmQ+AC4gOdE7DngmIoY1qH09yWj6IFABzIiIOyQ9D1wXEY9J2hARZ6b921PPCOAB4Gbgmog4JW1/E/h0RPxJ0h3AkohouorHzFM01ikcJJl2mAJMJVktIuAW4Cskgf0EScAe0h+4ISI+k06d1AD/FBFVETE8IuYAHArT1BeB3zbYHkSDlTERsRYYDewiOaH5U2BDOt3zALCcZIqmqc9HxEiSN4dDbgQmpSdu9zTYR871pHVcGxH/2sI+G/IozVrkgLfO4MfA10gC9XXgX4Dp6fz78SRTFI9GxMq0fw9gJfDfct2BpIuA7hHxfIPmj5LMtzdUSfJJ4lHg30iWXUIS8F8AfpXjLh8EvgGcyJEpmHbVExGbIuKX6Wazv1VJvUje6P6cY03WxTjgraTSde7LSKZlupHMV38a+N+SRgKrgRURcU3a/0ySOfF/iohlDV8q
vbW0j4tIlltOTbfLJf0N8KdDyx0beJNkFc5xwEzgryVdAvwS+GeS9eqfb9C/J/BsunRxcrpNehwi+dTx5jHUc0jDi7d6pPvZQDIH/2orz8mZpCskbZVUK+m0Nvp+VdIqSUslnXGs+7YCigjffCvZDRhKsnLkNGArMDltv5wkGMe28JwTW2h7FBjXQvvNwH8CZzdouwZYT7LEsmHfE4CnSFbtVKRt/wCsIll2Ccna+CeAsnR7MnBcen9Eg34fJRn1XwOU51NPk9qOa3D/bZJ1+j076P/BsPQ1K4FqkjfU1vp+juQTzlDgKmBNqX+HfGv95pOs1mlI6hPJcsFDFx11j4i9x/iaHyEZGX/QETUeq85WD0B64dgJEfHddPu3JG+szX72ki4A/hARa9OlqvUR0b+4FVuuvEzSOo1D4Z7e/1MHveb7bfcqns5WT+oUklVCh7xDsizztaYdI+IRgHSJ5/dJptesk3LAm1l3Gp8I3kuyVPRo7gImkkzZWCflk6xm9h6NA/14ml+h20hE/B1wIUe+MM06oU4zB3/yySfHkCFDSl2GWZezY8cO9uzZw6mnnkpEsH79es4880x69erVrO/7779P79696dkzWSz08ssvM2zYMLp3717ssi21du3a7RExoKXHOs0UzZAhQ6itrW27o5l1qN27dzN06FCuvfZa1q9fj6RW/xZnz57Nhg0buOuuu3jyySf59re/zUsvtfqtyFYEklr9+mhP0Zh1ceXl5Sxbtoy5c+dSU1PDkiVLWL16NRMmTGjW95prruHgwYNUVlYyc+ZM7r333hJUbLnqNFM0VVVV4RG8mVn7SFobydd1NOMRvJlZRjngzcwyygFvZpZRDngzs4xywJuZZZQD3swsoxzwZmYZ1WmuZDWz9hsyIxv/1vamnzS/qMqOnUfwZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWWUA966rEWLFlFRUUFVVRUbN25std+BAwe49NJL6d27N0OGDGHVqlUAXHTRRVRUVBy+9erVi2eeeaZY5Zu1yRc6WZe0bt065syZQ11dHfX19UybNo0VK1q+aOjGG2+kZ8+ebN26lYcffpipU6fyxhtv8NBDDx3us2PHDqqrqznnnHOKdQhmbfII3rqk5cuXM2nSJCorK6murmb79u3s3bu3xb4jRoxgwYIF9O3bl4kTJ7J58+ZmfW666Sa+9a1v0bt370KXbpYzj+CtS9q8eTNjxow5vF1ZWUl9fT2f+tSnmvW94IILDt9ftWoVo0aNavT4gQMHWLJkCa+88krB6jXLhwPeuqSDBw9SXl5+eLusrIydO3ce9Tn79+/nuuuuY968eY3a77//fs4//3z69OlTkFrN8uUpGuuS+vXr1yjQ9+3bR7duR/9zmDVrFiNHjmTcuHGN2hcvXszll19ekDrNjkWbAS/pREkrJT0u6UFJvST9TtLT6W1Y2u96SS9KurXBc5u1mXUGVVVVrFmzBoCIoK6ujkGDBrXaf+XKlSxdupSFCxc2at+1axevvPIKn/vc5wpar1k+chnBfxWYGxHnA1uBGcA9ETEmva2TdDYwGjgHeFfSuS21FegYzNpt/PjxPPjggzzwwAPMmTOH/v37M3jw4Bb7vvnmm0yaNIl7772Xvn37Nnrs17/+NdXV1XTv3r0YZZu1S5sBHxELIuKJdHMA8GdgoqQXJN0uqQfwReD+iAjgMeDzrbQ1ImmKpFpJtdu2beugQzJrW3l5OcuWLWPu3LnU1NSwZMkSVq9ezYQJzb+X/NZbb2XXrl1ceOGFh9e8b9myBYCamppmJ13NOgsl+ZtDR6kauIFkBL85IrZIugtYBowAXo2IhyR9ApgOvN20LSKuaO31q6qqora29hgPx6xr8T/4YZLWRkRVS4/ltIpG0knAfOArwNaI2J8+VAucAewBjk/b+pB8MmipzczMiiSXk6y9gKXAdyOiHlgsaYSk7sDFwCvAWpL5dkhG85taaTMz
syLJZQT/TeAsYKakmcBTwGJAwMMRsUpSN+DHkm4GxqW3+hbazMysSNoM+IhYCCxs0nx9kz5/SVfJTABujoiNAC21mZlZcXTYlawRsY/khOtR28zMrDh84tPMLKMc8GZmGeUvG7MPtaysAwevBbeO5xG8mVlGOeDNzDLKAW9mllEOeDOzjHLAm5lllAPezCyjHPBmZhnlgDczyygHvJlZRjngzcwyygFvZpZRDngzs4xywJuZZZQD3swsoxzwZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWWUA97MupxFixZRUVFBVVUVGzdubLXfgQMHuPTSS+nduzdDhgxh1apVjR6fPHkyd9xxR4GrzZ8D3sy6lHXr1jFnzhzq6uqYP38+06ZNa7XvjTfeSM+ePdm6dSuzZ89m6tSphx+bNWsWixcvLkbJeXPAm1mXsnz5ciZNmkRlZSXV1dVs376dvXv3tth3xIgRLFiwgL59+zJx4kQ2b94MwGuvvcaOHTu47LLLill6uzngzaxL2bx5M8OHDz+8XVlZSX19fYt9L7jgAk488UQAVq1axahRowAYOnQot9xyC926de4I7VHqAszMiungwYOUl5cf3i4rK2Pnzp1Hfc7+/fu57rrrmDdvHgCSClpjR2nz7UfSiZJWSnpc0oOSekm6XdIaSd9r0C+nNjOzUurXr1+jQN+3b1+bI/FZs2YxcuRIxo0bV+jyOlQuny++CsyNiPOBrcDfA90joho4XdIZki7Jpa1QB2FmlquqqirWrFkDQERQV1fHoEGDWu2/cuVKli5dysKFC4tVYodpc4omIhY02BwA/ANwU7r9ODAa+AxwXw5tbzR8bUlTgCkAp556al4HYGbWHuPHj2f69OmMHTuW9evX079/fwYPHtxi3zfffJNJkyaxYsUK+vbtW+RKj13OZwgkVQP9gN8Db6fNO4CBQFmObY1ExG0RURURVQMGDMjrAMzM2qO8vJxly5Yxd+5campqWLJkCatXr2bChAnN+t56663s2rWLCy+8kIqKCioqKtiyZUsJqs5PTidZJZ0EzAe+AkwHjk8f6kPyJrEnxzYzs5Krrq7mueeea9S2YsWKZv3mzZt3+MRqSzrzRU6Q20nWXsBS4LsRUQ+sJZluARgBbGpHm5mZFUkuI/hvAmcBMyXNBH4OfE1SJTAeGAUE8GwObWZmViRtjuAjYmFE9IuIMentTmAM8DzwpYjYFRG7c2kr1EGYmVlzeV3oFBHvcWSFTLvazMysOHzi08wsoxzwZmYZ5YA3M8sof9mYmX0oDZnRfN36h9WmnzS/yKojeARvZpZRDngzs4xywJuZZZQD3swsoxzwZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWWUA97MLKMc8GZmGeWANzPLKAe8mVlGOeDNzDLKAW9mllEOeDOzjHLAm5lllAPezCyjHPBmZhnlgDczyygHvJlZRjngzcwyygFvZpZROQW8pIGSnk3vD5K0WdLT6W1A2n67pDWSvtfgec3azMysONoMeEn9gDuBsrTpvwM/jIgx6W2bpEuA7hFRDZwu6YyW2gp1EGZm1lwuI/iDwGXA7nR7FDBZUp2kH6VtY4D70vuPA6NbaWtE0hRJtZJqt23bltcBmJlZy9oM+IjYHRG7GjStJAnvzwLVkoaTjO7fTh/fAQxspa3pa98WEVURUTVgwIC8D8LMzJrrkcdzVkfEfgBJLwFnAHuA49PH+5C8cbTUZmZmRZJP6D4m6aOSTgDOB9YDazkyBTMC2NRKm5mZFUk+I/jrgaeAA8CiiNggaQvwrKRKYDzJPH200GZmZkWSc8BHxJj0v08Bn2zy2G5JY4DzgJ8emrNvqc3MzIojnxF8iyLiPY6smmm1zczMisMnPs3MMsoBb2aWUQ54M7OMcsCbmWWUA97MLKMc8GZmGeWANzPLKAe8mVlGOeDNzDLKAW9mllEOeDOzjHLAm5lllAO+
i1u0aBEVFRVUVVWxcePGNvtPnjyZO+64o1HbVVddRVlZGQMHDuTuu+8uUKVm1l4O+C5s3bp1zJkzh7q6OubPn8+0adOO2n/WrFksXry4Uds999zD66+/zltvvcV9993HFVdcwf79+wtZtpnlqMO+Ltg+fJYvX86kSZOorKyksrKS7du3s3fvXsrKypr1fe2119ixYweXXXZZo/bBgwfz85//nIqKCioqKpDEzp07GTiw2T/Ba2ZF5hF8F7Z582aGDx9+eLuyspL6+voW+w4dOpRbbrmFbt0a/8qMHj2awYMHA/DCCy9w0kknOdzNOgkHfBd28OBBysvLD2+XlZWxc+fOFvtKavP1ZsyYwXe+850Oq8/Mjo0Dvgvr169fo0Dft29fsxF6rhYuXMiBAweYMmVKR5VnZsfIAd+FVVVVsWbNGgAigrq6OgYNGtTu13n55ZeZPXs2ixcvzvsNwsw6nk+ydmHjx49n+vTpjB07lvXr19O/f//D8+m5+uMf/8iXv/xlFixYwGmnnVagSs0sHx5udWHl5eUsW7aMuXPnUlNTw5IlS1i9ejUTJkzI+TV+8Ytf8M477zB16tTDK2leeOGFAlZtZrnyCL6BRYsW8YMf/IBTTjmFpUuXtjkinTx5MqNHj+brX/96cQosgOrqap577rlGbStWrGi1f9OLnK6++mquvvrqQpRmZsfII/hUR1z0Y2bWmXgEn+qIi37MzDoTj+BTHXHRj5lZZ+KESnX0RT9mZqXmgE915EU/ZmadgRMs1VEX/ZiZdRY+yZrqiIt+SmXIjNaXNX7YbPpJ7mvwzezochrBSxoo6dn0fk9Jv5L0G0nfaE9bZ9YRF/2YmXUmbY7gJfUD7gQOrRe8ClgbET+Q9IikpcD/zKUtIt4v1IF0hGO96MfMrDPJZQR/ELgM2J1ujwHuS+8/A1S1o83MzIqkzYCPiN0RsatBUxnwdnp/BzCwHW2NSJoiqVZS7bZt2/I7AjMza1E+q2j2AMen9/ukr5FrWyMRcVtEVEVE1YABA/IoxczMWpNPwK8FRqf3RwCb2tFmZmZFks8yyTuBRyR9HvgU8FuSqZhc2szMrEhyHsFHxJj0v/XAecBvgHMj4mCubR1dvJmZtS6vC50i4h2OrJBpV5uZmRVHZq5kzcrVnL6S08w6ir+LxswsoxzwZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWWUA97MLKMc8GZmGeWANzPLKAe8mVlGOeDNzDLKAW9mllEOeDOzjHLAm5lllAPezCyjHPBmZhnlgDczyygHvJlZRjngzcwyygFvZpZRDngzs4xywJuZZZQD3swsoxzwZmYZ5YA3M8soB7yZWUa1O+Al9ZD0O0lPp7dhkq6X9KKkWxv0a9ZmZmbFk88IfjhwT0SMiYgxQC9gNHAO8K6kcyWd3bStowo2M7Pc9MjjOaOAiZK+BKwDNgD3R0RIegwYD+xqoW1VRxVtZmZty2cE/yJwbkScA/QEjgfeTh/bAQwEylpoa0bSFEm1kmq3bduWRylmZtaafAL+1YjYkt6vBfaQhDxAn/Q1W2prJiJui4iqiKgaMGBAHqWYmVlr8gn4xZJGSOoOXEwyWh+dPjYC2ASsbaHNzMyKKJ85+NnA3YCAh4EbgGcl3QyMS2/1wI+btJmZWRG1O+AjYj3JSprD0lUyE4CbI2Jja21mZlY8+Yzgm4mIfcCyttrMzKx4fCWrmVlGOeDNzDLKAW9mllEOeDOzjHLAm5lllAPezCyjHPBmZhnlgDczyygHvJlZRjngzcwyygFvZpZRDngzs4xywJuZZZQD3swsoxzwZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWWUA97MLKMc8GZmGeWANzPLKAe8mVlGOeDNzDLKAW9mllEOeDOzjHLAm5llVMEDXtLtktZI+l6h92VmZkcUNOAlXQJ0j4hq4HRJZxRyf2ZmdkShR/BjgPvS+48D
owu8PzMzSykiCvfi0u3Av0bEK5LOB86KiJ80eHwKMCXdPBPYULBiOsbJwPZSF1EiXfnYoWsff1c+duj8x/+xiBjQ0gM9CrzjPcDx6f0+NPnEEBG3AbcVuIYOI6k2IqpKXUcpdOVjh659/F352OHDffyFnqJZy5FpmRHApgLvz8zMUoUewS8HnpVUCYwHRhV4f2ZmliroCD4idpOcaH0e+FJE7Crk/orgQzOdVABd+dihax9/Vz52+BAff0FPspqZWen4SlazNkg6SdJ5kk4udS1m7eGAz5GkgZKeLXUdxSbpREkrJT0u6UFJvUpdUzFJ6gf8B3AO8JSkFpejZVn6u/9SqesoJkk9JP1O0tPpbVipa8qHAz4H6R/5nUBZqWspga8CcyPifGArMK7E9RTbcGB6RPwQeAw4q8T1lML/4chy565iOHBPRIxJb+tKXVA+HPC5OQhcBuwudSHFFhELIuKJdHMA8G4p6ym2iKiJiOclfYFkFL+m1DUVk6SxwF6SN/euZBQwUdIL6fdpFXrFYUE44HMQEbszsALomEiqBvpFxPOlrqXYJInkDf494IMSl1M06XTc94EZpa6lBF4Ezo2Ic4CewAUlricvDnhrk6STgPnAN0pdSylE4krgVeDCUtdTRDOABRGxs9SFlMCrEbElvV8LfCi/KNEBb0eVjuKWAt+NiPpS11Nskr4jaVK62RfoSmF3LnClpKeBkZJ+VuJ6immxpBGSugMXA6+UuqB8eB18O0h6OiLGlLqOYpI0FfgRR37BF0bEL0tYUlGlJ9jvA44D1gNXRhf8o+lqv/uSPg3cDQh4OCJmlrikvDjgzcwyylM0ZmYZ5YA3M8soB7yZWUY54M3MMsoBb2aWUQ54M7OMcsCbmWXU/wckKuzvK4x3EQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Count the 4-character comments for each star rating (1-5) and plot them.\n",
    "# The length filter does not depend on the star value, so it is built once.\n",
    "# It is combined with the star filter via `&` instead of chaining two boolean\n",
    "# indexers: chaining applies a full-length mask to an already-filtered frame,\n",
    "# which makes pandas warn that the boolean key is being reindexed.\n",
    "x = np.arange(1,6)\n",
    "is_len_4 = comments['comment_length'] == 4\n",
    "nums = [int((is_len_4 & (comments['star'] == i)).sum()) for i in x]\n",
    "plot_score_distribution(x, nums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>67</td>\n",
       "      <td>3d扣分</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141</td>\n",
       "      <td>爱与坚持</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>162</td>\n",
       "      <td>勇敢面对</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>180</td>\n",
       "      <td>励志大片</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>208</td>\n",
       "      <td>MIT？</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>220</td>\n",
       "      <td>MIT？</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>251</td>\n",
       "      <td>平民励志</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>258</td>\n",
       "      <td>平民励志</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>265</td>\n",
       "      <td>苦尽甘来</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>268</td>\n",
       "      <td>很好看。</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>303</td>\n",
       "      <td>喜欢喜欢</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>330</td>\n",
       "      <td>很立志啊</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>445</td>\n",
       "      <td>尼玛还钱</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>459</td>\n",
       "      <td>尼玛还钱</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>475</td>\n",
       "      <td>尼玛还钱</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>484</td>\n",
       "      <td>负分滚粗</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>494</td>\n",
       "      <td>尼玛还钱</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>503</td>\n",
       "      <td>负分滚粗</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>508</td>\n",
       "      <td>尼玛还钱</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    comment  star\n",
       "4      中二得很     1\n",
       "67     3d扣分     2\n",
       "141    爱与坚持     3\n",
       "162    勇敢面对     4\n",
       "180    励志大片     5\n",
       "208    MIT？     4\n",
       "220    MIT？     4\n",
       "251    平民励志     4\n",
       "258    平民励志     4\n",
       "265    苦尽甘来     4\n",
       "268    很好看。     5\n",
       "303    喜欢喜欢     5\n",
       "330    很立志啊     5\n",
       "445    尼玛还钱     1\n",
       "459    尼玛还钱     1\n",
       "475    尼玛还钱     1\n",
       "484    负分滚粗     1\n",
       "494    尼玛还钱     1\n",
       "503    负分滚粗     1\n",
       "508    尼玛还钱     1"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 20 four-character comments together with their star rating.\n",
    "comments.loc[comments['comment_length'] == 4, ['comment', 'star']].head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 文本向量化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 词向量\n",
    "- 使用中文维基的预训练词向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pretrained Word2Vec model from `model_path` and keep only its\n",
    "# word vectors (`wv`). The full model object is never bound to a name, so\n",
    "# this cell holds no lasting reference to it.\n",
    "wv = Word2Vec.load(model_path).wv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "845989"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size of the pretrained vectors. gensim 4 replaced `wv.vocab`\n",
    "# with `wv.key_to_index` (accessing `vocab` raises there), so prefer the new\n",
    "# attribute when it exists and fall back to the gensim 3.x one otherwise.\n",
    "len(wv.key_to_index if hasattr(wv, 'key_to_index') else wv.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('张静初', 0.8863259553909302),\n",
       " ('刘烨', 0.8568055629730225),\n",
       " ('廖凡', 0.8530554175376892),\n",
       " ('张震', 0.8500816822052002),\n",
       " ('张涵予', 0.8481923341751099),\n",
       " ('胡军', 0.8465080857276917),\n",
       " ('段奕宏', 0.8462876081466675),\n",
       " ('黄轩', 0.844296932220459),\n",
       " ('黄渤', 0.8436167240142822),\n",
       " ('喻亢', 0.8426705598831177)]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity-check the embeddings: list the nearest neighbours of '吴京'.\n",
    "wv.most_similar(positive=['吴京'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48404"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Position of a word in the embedding vocabulary. Look it up through the\n",
    "# vocabulary mapping (constant time) instead of `list.index`, which scans the\n",
    "# whole word list. gensim 4 exposes the mapping as `key_to_index`; gensim 3.x\n",
    "# stores the position on the `Vocab` entry held in `wv.vocab`.\n",
    "# A word that is not in the vocabulary raises KeyError.\n",
    "if hasattr(wv, 'key_to_index'):\n",
    "    index = wv.key_to_index['吴京']\n",
    "else:\n",
    "    index = wv.vocab['吴京'].index\n",
    "index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 创建词汇表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(texts):\n",
    "    vocab = {}\n",
    "    for t in texts:\n",
    "        words = t.strip().split()\n",
    "        for word in words:\n",
    "            if word != 'unkown':\n",
    "                vocab[word] = vocab.get(word, 0) + 1\n",
    "    return vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'吴京': 279,\n",
       " '意淫': 279,\n",
       " '到': 10358,\n",
       " '了': 102050,\n",
       " '脑残': 316,\n",
       " '的': 328370,\n",
       " '地步': 197,\n",
       " '看': 34252,\n",
       " '恶心': 922,\n",
       " '想': 7471,\n",
       " '吐': 565,\n",
       " '首映礼': 42,\n",
       " '太': 12975,\n",
       " '恐怖': 596,\n",
       " '这个': 10269,\n",
       " '电影': 34593,\n",
       " '不讲道理': 8,\n",
       " '完全': 4152,\n",
       " '就是': 14014,\n",
       " '在': 31161,\n",
       " '实现': 270,\n",
       " '他': 10655,\n",
       " '小': 6647,\n",
       " '粉红': 39,\n",
       " '英雄': 1705,\n",
       " '梦': 871,\n",
       " '各种': 3136,\n",
       " '装备': 80,\n",
       " '轮番': 21,\n",
       " '上场': 17,\n",
       " '视': 29,\n",
       " '物理': 63,\n",
       " '逻辑': 1414,\n",
       " '于': 1783,\n",
       " '不顾': 57,\n",
       " '不得不': 670,\n",
       " '说': 11123,\n",
       " '有钱': 205,\n",
       " '真': 5180,\n",
       " '好': 22827,\n",
       " '随意': 170,\n",
       " '胡闹': 45,\n",
       " '炒作': 70,\n",
       " '水平': 819,\n",
       " '不输': 48,\n",
       " '冯小刚': 266,\n",
       " '但小刚': 1,\n",
       " '至少': 912,\n",
       " '不会': 2684,\n",
       " '用': 3944,\n",
       " '主旋律': 923,\n",
       " '来': 5251,\n",
       " '让': 13704,\n",
       " '人': 24157,\n",
       " '不': 28318,\n",
       " '舒服': 607,\n",
       " '为了': 3506,\n",
       " '而': 6535,\n",
       " '煽情': 1171,\n",
       " '觉得': 8878,\n",
       " '是': 72707,\n",
       " '个': 6664,\n",
       " '大': 5911,\n",
       " '做作': 822,\n",
       " '谎言': 266,\n",
       " '家': 564,\n",
       " '7': 1719,\n",
       " '29': 96,\n",
       " '更新': 87,\n",
       " '片子': 9578,\n",
       " '整体': 1410,\n",
       " '不如': 1795,\n",
       " '湄公河': 57,\n",
       " '行动': 187,\n",
       " '1': 3510,\n",
       " '不够': 1818,\n",
       " '流畅': 699,\n",
       " '编剧': 2270,\n",
       " '有毒': 38,\n",
       " '台词': 2368,\n",
       " '尴尬': 1655,\n",
       " '2': 3789,\n",
       " '刻意': 1040,\n",
       " '显得': 1098,\n",
       " '如此': 2686,\n",
       " '不合时宜': 33,\n",
       " '又': 11552,\n",
       " '多余': 311,\n",
       " '凭良心说': 3,\n",
       " '看到': 5457,\n",
       " '不像': 277,\n",
       " '战狼': 36,\n",
       " '续集': 710,\n",
       " '完虐': 7,\n",
       " '中二得': 5,\n",
       " '很': 34366,\n",
       " '犯': 182,\n",
       " '我': 50036,\n",
       " '中华': 55,\n",
       " '者': 549,\n",
       " '虽远必': 22,\n",
       " '诛': 33,\n",
       " '比': 6232,\n",
       " '这句': 231,\n",
       " '话': 1764,\n",
       " '还要': 780,\n",
       " '一百倍': 14,\n",
       " '脑子': 295,\n",
       " '东西': 2288,\n",
       " '希望': 1961,\n",
       " '们': 2491,\n",
       " '都': 36318,\n",
       " '能': 9621,\n",
       " '有': 27764,\n",
       " '三星': 1977,\n",
       " '半': 1928,\n",
       " '实打实': 26,\n",
       " '分': 2874,\n",
       " '第一集': 211,\n",
       " '爱国': 136,\n",
       " '内部': 115,\n",
       " '做': 4426,\n",
       " '着': 7128,\n",
       " '置换': 18,\n",
       " '与': 9532,\n",
       " '较劲': 17,\n",
       " '但': 15545,\n",
       " '第二集': 69,\n",
       " '才': 4969,\n",
       " '真正': 1483,\n",
       " '显露': 17,\n",
       " '野心': 261,\n",
       " '终于': 1775,\n",
       " '抛弃': 225,\n",
       " '李忠志': 2,\n",
       " '新增': 7,\n",
       " '外来': 28,\n",
       " '班底': 73,\n",
       " '硬件': 24,\n",
       " '实力': 194,\n",
       " '机会': 386,\n",
       " '和': 31342,\n",
       " '国际': 208,\n",
       " '接轨': 7,\n",
       " '开篇': 193,\n",
       " '水下': 33,\n",
       " '长镜头': 686,\n",
       " '诸如': 40,\n",
       " '铁丝网': 3,\n",
       " '拦截': 9,\n",
       " 'rpg': 17,\n",
       " '弹头': 8,\n",
       " '细节': 2266,\n",
       " '设计': 1177,\n",
       " '国产': 1119,\n",
       " '动作片': 1105,\n",
       " '重新': 452,\n",
       " '封顶': 4,\n",
       " '理念': 113,\n",
       " '上': 10112,\n",
       " '它': 3302,\n",
       " '甚至': 1278,\n",
       " '做到': 533,\n",
       " '绣春刀': 37,\n",
       " '最': 8578,\n",
       " '想做到': 5,\n",
       " '那': 7485,\n",
       " '部分': 1973,\n",
       " '惊险': 107,\n",
       " '大气': 181,\n",
       " '引人入胜': 103,\n",
       " '结合': 423,\n",
       " '不俗': 75,\n",
       " '快': 1207,\n",
       " '剪下': 2,\n",
       " '真刀真枪': 4,\n",
       " '不禁': 137,\n",
       " '热血沸腾': 225,\n",
       " '特别': 2693,\n",
       " '弹簧床': 3,\n",
       " '架': 56,\n",
       " '挡': 73,\n",
       " '炸弹': 100,\n",
       " '空手': 9,\n",
       " '接': 333,\n",
       " '碎玻璃': 4,\n",
       " '弹匣': 2,\n",
       " '割喉': 11,\n",
       " '等': 1770,\n",
       " '帅': 1009,\n",
       " '得': 9976,\n",
       " '飞起': 39,\n",
       " '就算': 902,\n",
       " '前半段': 588,\n",
       " '铺垫': 587,\n",
       " '节奏': 3513,\n",
       " '散漫': 64,\n",
       " '主角': 2141,\n",
       " '光环': 217,\n",
       " '开太大': 3,\n",
       " '也': 32064,\n",
       " '不怕': 135,\n",
       " '作为': 2412,\n",
       " '一个': 17832,\n",
       " '中国': 3581,\n",
       " '两个': 2953,\n",
       " '小时': 1810,\n",
       " '弥漫着': 37,\n",
       " '强大': 691,\n",
       " '不可': 516,\n",
       " '侵犯': 24,\n",
       " '氛围': 559,\n",
       " '还是': 16857,\n",
       " '那颗': 47,\n",
       " '民族': 324,\n",
       " '自豪': 24,\n",
       " '心': 1085,\n",
       " '砰砰': 53,\n",
       " '砰': 58,\n",
       " '跳个': 1,\n",
       " '不停': 436,\n",
       " '15': 341,\n",
       " '100': 340,\n",
       " '冷峰': 1,\n",
       " '这部': 7640,\n",
       " '里': 7911,\n",
       " '即': 374,\n",
       " '像': 5723,\n",
       " '成龙': 883,\n",
       " '像杰': 2,\n",
       " '森斯坦': 150,\n",
       " '森': 157,\n",
       " '体制': 213,\n",
       " '外': 501,\n",
       " '同': 556,\n",
       " '类型': 1509,\n",
       " '总是': 1988,\n",
       " '代表': 471,\n",
       " '个人': 1741,\n",
       " '无能': 185,\n",
       " '政府': 244,\n",
       " '需要': 1821,\n",
       " '求助于': 4,\n",
       " '这些': 1479,\n",
       " '才能': 836,\n",
       " '解决': 280,\n",
       " '难题': 53,\n",
       " '体现': 435,\n",
       " '价值': 334,\n",
       " '所以': 2768,\n",
       " '照抄': 24,\n",
       " '这种': 6498,\n",
       " '模式': 481,\n",
       " '实际上': 187,\n",
       " '问题': 2398,\n",
       " '我们': 6039,\n",
       " '以前': 1095,\n",
       " '嘲笑': 88,\n",
       " '英雄主义': 283,\n",
       " '却': 6254,\n",
       " '没想到': 937,\n",
       " '捆绑': 34,\n",
       " '爱国主义': 87,\n",
       " '全能': 17,\n",
       " '战士': 111,\n",
       " '更加': 722,\n",
       " '难以': 482,\n",
       " '下咽': 11,\n",
       " '多': 9221,\n",
       " '无脑': 258,\n",
       " '信': 220,\n",
       " '戏': 3717,\n",
       " '对': 10176,\n",
       " '吴京路': 1,\n",
       " '转粉': 31,\n",
       " '最后': 9966,\n",
       " '彩蛋': 561,\n",
       " '没有': 14757,\n",
       " '理由': 374,\n",
       " '期待': 1433,\n",
       " '下': 3549,\n",
       " '一部': 9687,\n",
       " '假': 631,\n",
       " '嗨': 214,\n",
       " '几处': 175,\n",
       " '情节': 3993,\n",
       " '设置': 521,\n",
       " '过于': 1025,\n",
       " '彰显': 67,\n",
       " '国家': 828,\n",
       " '自豪感': 5,\n",
       " '稍显': 201,\n",
       " '突兀': 390,\n",
       " '爽片': 21,\n",
       " '打戏': 356,\n",
       " '挺燃': 12,\n",
       " '但是': 7720,\n",
       " '故事': 14999,\n",
       " '一般': 3269,\n",
       " '达康': 13,\n",
       " '书记': 19,\n",
       " '合适': 297,\n",
       " '角色': 3732,\n",
       " '赵': 45,\n",
       " '东来': 11,\n",
       " '倒': 1608,\n",
       " '张瀚': 6,\n",
       " '太太': 199,\n",
       " '太违': 15,\n",
       " '分钟': 1828,\n",
       " '穿越': 528,\n",
       " '回': 399,\n",
       " '偶像剧': 112,\n",
       " '接到': 24,\n",
       " '非洲': 134,\n",
       " '卧底': 189,\n",
       " '冷锋': 14,\n",
       " '报告': 26,\n",
       " '丁义珍': 6,\n",
       " '现在': 3750,\n",
       " '请求': 13,\n",
       " '抓捕': 8,\n",
       " '李达康': 4,\n",
       " '这件': 111,\n",
       " '事先': 24,\n",
       " '不要': 2509,\n",
       " '声张': 4,\n",
       " '别': 731,\n",
       " '省厅': 3,\n",
       " '知道': 5386,\n",
       " '就': 25673,\n",
       " '你': 17223,\n",
       " '一起': 2803,\n",
       " '去': 7294,\n",
       " '加上': 901,\n",
       " '同志': 267,\n",
       " '三人': 130,\n",
       " '逮捕': 12,\n",
       " '这次': 940,\n",
       " '行': 889,\n",
       " '叫': 2034,\n",
       " '吧': 10654,\n",
       " '拍': 8200,\n",
       " '喜剧': 2727,\n",
       " '整个': 1785,\n",
       " '感觉': 8011,\n",
       " '挺': 6172,\n",
       " '搞笑': 2355,\n",
       " '这么': 6894,\n",
       " '打': 3941,\n",
       " '过': 3589,\n",
       " '徐晓冬': 1,\n",
       " '么': 3586,\n",
       " '心往': 3,\n",
       " '一处': 76,\n",
       " '劲往': 3,\n",
       " '使': 490,\n",
       " '梦想': 1192,\n",
       " '看吧': 154,\n",
       " '第一部': 2310,\n",
       " '好太多': 93,\n",
       " '谢谢': 238,\n",
       " '美队': 125,\n",
       " '动作': 3381,\n",
       " '指导': 121,\n",
       " '这': 17400,\n",
       " '火': 177,\n",
       " '没见识': 5,\n",
       " '开头': 1341,\n",
       " '长': 1079,\n",
       " '对决': 215,\n",
       " '戏可算': 1,\n",
       " '华语': 297,\n",
       " '顶尖': 27,\n",
       " '存在': 1295,\n",
       " '驱逐舰': 4,\n",
       " '导弹': 25,\n",
       " '坦克': 215,\n",
       " '商业片': 544,\n",
       " '狂用': 1,\n",
       " '镜头': 4294,\n",
       " '运用': 397,\n",
       " '笑': 3690,\n",
       " '点': 3730,\n",
       " '插入': 85,\n",
       " '好莱坞': 1262,\n",
       " '爆米花': 560,\n",
       " '不功': 27,\n",
       " '不过': 6527,\n",
       " '从头': 342,\n",
       " '打到': 67,\n",
       " '尾': 347,\n",
       " '拼': 313,\n",
       " '虽然': 5822,\n",
       " '有略': 4,\n",
       " '乱': 653,\n",
       " '时': 2957,\n",
       " '因为': 4085,\n",
       " '没': 11089,\n",
       " '啥': 1845,\n",
       " '期望值': 56,\n",
       " '被': 9951,\n",
       " '吓了一跳': 10,\n",
       " '吴刚': 10,\n",
       " '谦和': 6,\n",
       " '丁海峰': 1,\n",
       " '老': 2995,\n",
       " '三位': 111,\n",
       " '炖': 45,\n",
       " '烂熟': 8,\n",
       " '牛筋': 1,\n",
       " '嚼': 51,\n",
       " '用心': 635,\n",
       " '啊': 20659,\n",
       " '导演': 8665,\n",
       " '小看': 21,\n",
       " '确实': 2167,\n",
       " '下功夫': 27,\n",
       " '拉': 399,\n",
       " '借鉴': 174,\n",
       " '至于': 474,\n",
       " '大家': 1361,\n",
       " '比较': 3302,\n",
       " '反感': 144,\n",
       " '情绪': 1075,\n",
       " '那些': 2407,\n",
       " '桥段': 954,\n",
       " '必备': 57,\n",
       " '稍微': 405,\n",
       " '一点': 2990,\n",
       " '还': 17565,\n",
       " '可以': 8890,\n",
       " '接受': 925,\n",
       " '最好': 1951,\n",
       " '地方': 2173,\n",
       " '掌握': 151,\n",
       " '张弛': 68,\n",
       " '有度': 61,\n",
       " '这点': 282,\n",
       " '难得': 879,\n",
       " '一直': 3317,\n",
       " '脑子里': 62,\n",
       " '回响': 39,\n",
       " '片头': 357,\n",
       " '海里': 23,\n",
       " '那场': 491,\n",
       " '戏看': 22,\n",
       " '完': 5016,\n",
       " '呆': 245,\n",
       " '下去': 879,\n",
       " '太假': 193,\n",
       " '提前': 195,\n",
       " '离场': 130,\n",
       " '好看': 7701,\n",
       " '演技': 5621,\n",
       " '棒呆': 31,\n",
       " '符合': 413,\n",
       " '反而': 828,\n",
       " '更': 6109,\n",
       " '差': 1706,\n",
       " '这一': 125,\n",
       " '放之四海而皆准': 3,\n",
       " '规律': 30,\n",
       " '场面': 2286,\n",
       " '越做越': 10,\n",
       " '然而': 793,\n",
       " '伴随': 124,\n",
       " '特效': 1974,\n",
       " '升级': 123,\n",
       " '叙事': 2098,\n",
       " '变得': 605,\n",
       " '非常': 4843,\n",
       " '凌乱': 212,\n",
       " '格局': 339,\n",
       " '颇': 391,\n",
       " '拍成': 464,\n",
       " '黑鹰坠落': 30,\n",
       " '结果': 1813,\n",
       " '撑死': 16,\n",
       " '最多': 100,\n",
       " '只是': 4107,\n",
       " '官方': 66,\n",
       " '版': 2194,\n",
       " '敢死队': 64,\n",
       " '但论': 8,\n",
       " '自我': 769,\n",
       " '角色定位': 15,\n",
       " '能力': 616,\n",
       " '远': 380,\n",
       " '如同': 318,\n",
       " '演员': 5510,\n",
       " '出身': 113,\n",
       " '甄子丹': 387,\n",
       " '喜欢': 13814,\n",
       " '不是': 8790,\n",
       " '装傻': 30,\n",
       " '真傻': 18,\n",
       " '要不是': 329,\n",
       " '真的': 7906,\n",
       " '别的': 445,\n",
       " '可': 2093,\n",
       " '肯定': 756,\n",
       " '选': 322,\n",
       " '直男癌': 59,\n",
       " '令人发指': 105,\n",
       " '所有': 2058,\n",
       " '剧情': 11609,\n",
       " '走向': 374,\n",
       " '九十年代': 112,\n",
       " '那套': 56,\n",
       " '照搬': 101,\n",
       " '审美': 269,\n",
       " '事儿': 324,\n",
       " '一时半会儿': 2,\n",
       " '培养': 50,\n",
       " '出来': 3402,\n",
       " '整部': 1077,\n",
       " '延续': 368,\n",
       " '风格': 2810,\n",
       " '热血': 739,\n",
       " '要': 8115,\n",
       " '不错': 10223,\n",
       " '适合': 1841,\n",
       " '演': 3436,\n",
       " '军人': 81,\n",
       " '之前': 1517,\n",
       " '片段': 555,\n",
       " '念': 132,\n",
       " '劲儿': 133,\n",
       " '来说': 1554,\n",
       " '张翰太违': 1,\n",
       " '一': 3707,\n",
       " '一股': 327,\n",
       " '雷阵雨': 3,\n",
       " '画风': 447,\n",
       " '目瞪狗': 2,\n",
       " '瘠薄': 3,\n",
       " '人牛': 5,\n",
       " 'b': 941,\n",
       " '硬道理': 18,\n",
       " '隔壁': 128,\n",
       " '建军': 6,\n",
       " '大爷': 195,\n",
       " '你们': 1878,\n",
       " '场景': 1666,\n",
       " '战斗': 355,\n",
       " '全线': 14,\n",
       " '打斗': 1159,\n",
       " '游走': 49,\n",
       " '审查': 133,\n",
       " '红线': 12,\n",
       " '边界': 27,\n",
       " '政治': 1053,\n",
       " '安全': 124,\n",
       " '缝隙': 17,\n",
       " '部': 861,\n",
       " '极具': 157,\n",
       " '煽动': 32,\n",
       " '大片': 1031,\n",
       " '制作': 1219,\n",
       " '精良': 215,\n",
       " '影片': 4879,\n",
       " '请': 958,\n",
       " '多来': 7,\n",
       " '胶卷': 17,\n",
       " '挺差': 8,\n",
       " '过度': 269,\n",
       " '部队': 81,\n",
       " '没太多': 38,\n",
       " '展示': 353,\n",
       " '死去': 247,\n",
       " '反正': 625,\n",
       " '吸引': 807,\n",
       " '冲': 254,\n",
       " '为什么': 3226,\n",
       " '鄙视': 87,\n",
       " '敢': 350,\n",
       " '开拓': 23,\n",
       " '允许': 69,\n",
       " '他们': 3130,\n",
       " '再': 5339,\n",
       " '直到': 337,\n",
       " '更好': 1000,\n",
       " '拍出': 576,\n",
       " '棒': 979,\n",
       " '出彩': 1005,\n",
       " '呢': 4496,\n",
       " '火爆': 192,\n",
       " '本片': 1954,\n",
       " '必将': 38,\n",
       " '燃爆': 58,\n",
       " '暑期': 54,\n",
       " '厉害': 847,\n",
       " '身为': 125,\n",
       " '武打': 320,\n",
       " '高标准': 3,\n",
       " '枪战': 696,\n",
       " '为': 4903,\n",
       " '点赞': 113,\n",
       " '热血男儿': 2,\n",
       " '荷尔蒙': 159,\n",
       " '爆发': 406,\n",
       " '给': 10567,\n",
       " '0': 919,\n",
       " '星': 2726,\n",
       " '血战': 39,\n",
       " '钢锯': 19,\n",
       " '岭': 34,\n",
       " '会': 7747,\n",
       " '歌颂': 73,\n",
       " '宗教': 719,\n",
       " '情怀': 958,\n",
       " '超越': 578,\n",
       " '政权': 33,\n",
       " '当': 2875,\n",
       " '只': 3334,\n",
       " '明显': 1200,\n",
       " '低': 756,\n",
       " '层次': 173,\n",
       " '充满': 1395,\n",
       " '现实': 2238,\n",
       " '乃至': 76,\n",
       " '投机': 29,\n",
       " '考量': 29,\n",
       " '高下': 49,\n",
       " '立': 90,\n",
       " '见': 737,\n",
       " '请问': 99,\n",
       " '吴京脑': 3,\n",
       " '残': 140,\n",
       " '火箭炮': 6,\n",
       " '吗': 4966,\n",
       " '傲气': 6,\n",
       " '雄鹰': 3,\n",
       " '第一': 530,\n",
       " '滴血': 26,\n",
       " '4': 2237,\n",
       " '算是': 1689,\n",
       " '国内': 510,\n",
       " '片': 6598,\n",
       " '准': 47,\n",
       " '钱': 938,\n",
       " '花': 442,\n",
       " '有效': 62,\n",
       " '气魄': 22,\n",
       " '创作': 266,\n",
       " '足够': 607,\n",
       " '真诚': 293,\n",
       " '人物': 3997,\n",
       " '连': 1653,\n",
       " '张翰': 38,\n",
       " '可爱': 3080,\n",
       " '如果': 3614,\n",
       " '当年': 1479,\n",
       " '那样': 1064,\n",
       " '一时': 103,\n",
       " '膨胀': 34,\n",
       " '银幕': 474,\n",
       " '独占': 5,\n",
       " '聚光灯': 9,\n",
       " '走': 1806,\n",
       " '扪心自问': 8,\n",
       " '没法': 381,\n",
       " '评价': 588,\n",
       " '全片': 1218,\n",
       " '靠': 1170,\n",
       " '戏撑': 2,\n",
       " '文戏': 309,\n",
       " '扯淡': 236,\n",
       " '女主角': 1429,\n",
       " '毫无': 1353,\n",
       " '必要': 476,\n",
       " '只要': 716,\n",
       " '开挂': 105,\n",
       " '牛': 1580,\n",
       " '逼': 3575,\n",
       " '之处': 118,\n",
       " '在于': 671,\n",
       " '透露': 122,\n",
       " '极': 238,\n",
       " '强烈': 550,\n",
       " '意识形态': 117,\n",
       " '枷锁': 32,\n",
       " '祖国': 67,\n",
       " '面前': 408,\n",
       " '一切': 1961,\n",
       " '反动派': 5,\n",
       " '纸老虎': 13,\n",
       " '人开': 3,\n",
       " '挂': 276,\n",
       " '团灭': 13,\n",
       " '合情合理': 33,\n",
       " '两星': 735,\n",
       " '鼓励': 462,\n",
       " '其他': 1757,\n",
       " '一般般': 410,\n",
       " '看点': 512,\n",
       " '有点': 7632,\n",
       " '手接': 2,\n",
       " '哈哈哈': 1620,\n",
       " '从': 4331,\n",
       " '之后': 2323,\n",
       " '炸': 208,\n",
       " '翻': 267,\n",
       " '一下': 1818,\n",
       " '四星': 1087,\n",
       " '当时': 1066,\n",
       " '其实': 5782,\n",
       " '完成度': 199,\n",
       " '接近': 310,\n",
       " '每个': 2145,\n",
       " '步骤': 8,\n",
       " '顺滑': 6,\n",
       " '任何': 1177,\n",
       " '出人意料': 76,\n",
       " '是因为': 882,\n",
       " '看看': 1627,\n",
       " '最近': 763,\n",
       " '世界': 3435,\n",
       " '抱歉': 76,\n",
       " '影院': 788,\n",
       " '燃': 172,\n",
       " '起来': 2037,\n",
       " '魔幻': 373,\n",
       " '当然': 1233,\n",
       " '强拆': 38,\n",
       " '现实感': 16,\n",
       " '一幕': 481,\n",
       " '开场': 456,\n",
       " '6': 1216,\n",
       " '搏斗': 38,\n",
       " '从来': 313,\n",
       " '其它': 192,\n",
       " '拍摄': 964,\n",
       " '难度': 72,\n",
       " '同时': 732,\n",
       " '技能': 81,\n",
       " '方面': 809,\n",
       " '要求': 318,\n",
       " '回来': 523,\n",
       " '搜': 51,\n",
       " '吴京会': 1,\n",
       " '游泳': 37,\n",
       " '潜水': 9,\n",
       " '滑雪': 14,\n",
       " '开': 512,\n",
       " '飞机': 551,\n",
       " '射击': 41,\n",
       " '各项': 6,\n",
       " '特意': 108,\n",
       " '特种部队': 36,\n",
       " '当过': 7,\n",
       " '18': 228,\n",
       " '月': 606,\n",
       " '兵': 46,\n",
       " '佩服': 344,\n",
       " '这样': 6666,\n",
       " '3': 3620,\n",
       " '星半': 149,\n",
       " '结束': 891,\n",
       " '掌声': 52,\n",
       " '出现': 1635,\n",
       " '近期': 118,\n",
       " '少见': 110,\n",
       " '一粒': 17,\n",
       " '大补丸': 1,\n",
       " '有人': 938,\n",
       " '吃': 1257,\n",
       " '开心': 773,\n",
       " '补大': 1,\n",
       " '从白': 1,\n",
       " '黑': 1002,\n",
       " '字幕': 768,\n",
       " '展现': 668,\n",
       " '超级': 1296,\n",
       " '直': 232,\n",
       " '男': 1878,\n",
       " '糙': 67,\n",
       " '猛': 130,\n",
       " '媲美': 80,\n",
       " '终结者': 69,\n",
       " '5': 3353,\n",
       " '无亮点': 63,\n",
       " '张翰变': 1,\n",
       " '谐星': 25,\n",
       " '3d': 1055,\n",
       " '掌控': 194,\n",
       " '逼近': 26,\n",
       " 'hold': 101,\n",
       " '不住': 212,\n",
       " '边缘': 168,\n",
       " '带感': 177,\n",
       " '拳拳': 105,\n",
       " '肉': 293,\n",
       " '超爽': 13,\n",
       " '聪明': 349,\n",
       " '鸡': 163,\n",
       " '贼': 84,\n",
       " '一面': 296,\n",
       " '旗下': 9,\n",
       " '呈现': 436,\n",
       " '一出': 296,\n",
       " '重工业': 4,\n",
       " '娱乐': 492,\n",
       " '调控': 9,\n",
       " '说教': 313,\n",
       " '比例': 43,\n",
       " '尺度': 197,\n",
       " '大众': 203,\n",
       " '接纳': 26,\n",
       " '把握': 368,\n",
       " '微妙': 224,\n",
       " '其中': 759,\n",
       " '一些': 2104,\n",
       " '奇侠': 9,\n",
       " '化': 629,\n",
       " '内容': 1011,\n",
       " '比如': 613,\n",
       " '玻璃碴': 2,\n",
       " '子当': 1,\n",
       " '飞镖': 14,\n",
       " '杀敌': 10,\n",
       " '一类': 108,\n",
       " '只不过': 325,\n",
       " '遮盖': 13,\n",
       " '掉': 689,\n",
       " '老爹': 81,\n",
       " '演过': 76,\n",
       " '美剧': 80,\n",
       " '搏击': 76,\n",
       " '王国': 47,\n",
       " '力荐': 104,\n",
       " '那部': 124,\n",
       " '为啥': 477,\n",
       " '奇异': 83,\n",
       " '恩典': 2,\n",
       " '配乐': 2802,\n",
       " '画内': 1,\n",
       " '男生': 186,\n",
       " '的话': 1392,\n",
       " '应该': 3298,\n",
       " '刺激': 619,\n",
       " '肾上腺素': 67,\n",
       " '女生': 334,\n",
       " '对龙小云': 1,\n",
       " '感情': 1736,\n",
       " '十分': 830,\n",
       " '打动': 598,\n",
       " '模仿': 570,\n",
       " '许多': 551,\n",
       " '怎么': 4810,\n",
       " '玩': 1228,\n",
       " '一股脑': 15,\n",
       " '堆': 40,\n",
       " '槽': 584,\n",
       " '几位': 177,\n",
       " '血厚到': 1,\n",
       " '科幻': 698,\n",
       " '级别': 187,\n",
       " '重复': 359,\n",
       " '满血': 15,\n",
       " '红血': 1,\n",
       " '中毒': 17,\n",
       " '极速': 34,\n",
       " '回血': 5,\n",
       " '爆种': 3,\n",
       " '打通': 14,\n",
       " '全场': 345,\n",
       " '太过': 616,\n",
       " '投机取巧': 10,\n",
       " '穿': 516,\n",
       " '迈克尔': 84,\n",
       " '贝都': 1,\n",
       " '不受': 10,\n",
       " '待见': 65,\n",
       " '国片': 18,\n",
       " '前仆后继': 9,\n",
       " '爆炸': 371,\n",
       " 'high': 277,\n",
       " '瞎燃': 1,\n",
       " '没用': 136,\n",
       " '10': 1476,\n",
       " '女人': 2365,\n",
       " '缺': 169,\n",
       " '男人': 2372,\n",
       " '征服': 75,\n",
       " '吴京直': 1,\n",
       " '男癌': 34,\n",
       " '🇨': 1,\n",
       " '🇳': 1,\n",
       " '美国': 2556,\n",
       " '不行': 1061,\n",
       " '死': 3816,\n",
       " '全都': 247,\n",
       " '跳': 490,\n",
       " '跟': 4138,\n",
       " '跳墙': 3,\n",
       " '一样': 4727,\n",
       " '拯救': 678,\n",
       " '国产片': 357,\n",
       " '以': 1962,\n",
       " '中印': 3,\n",
       " '局势': 11,\n",
       " '对比': 464,\n",
       " '假想': 8,\n",
       " '真是': 6447,\n",
       " '讽刺': 759,\n",
       " '谄媚': 19,\n",
       " '军旅': 11,\n",
       " '题材': 2762,\n",
       " '质感': 368,\n",
       " '燃到': 34,\n",
       " '国外': 150,\n",
       " '精彩': 2808,\n",
       " '看着': 1928,\n",
       " '有力': 224,\n",
       " '必须': 693,\n",
       " '安利': 43,\n",
       " '一下张': 2,\n",
       " '翰': 3,\n",
       " '简直': 2427,\n",
       " '承包': 27,\n",
       " '笑点': 1119,\n",
       " '量身定做': 34,\n",
       " '彭于': 223,\n",
       " '晏': 205,\n",
       " '可演': 6,\n",
       " '不来': 67,\n",
       " '不少': 1113,\n",
       " '漂移': 19,\n",
       " '无人机': 41,\n",
       " '突袭': 59,\n",
       " '直升机': 68,\n",
       " '坠露': 2,\n",
       " '肉搏': 96,\n",
       " '军舰': 13,\n",
       " '发射': 18,\n",
       " '叛乱': 4,\n",
       " '国际化': 22,\n",
       " '视角': 778,\n",
       " '标配': 55,\n",
       " '饰演': 260,\n",
       " '深入人心': 49,\n",
       " '搏命': 41,\n",
       " '精神': 990,\n",
       " '当下': 254,\n",
       " '第三部': 307,\n",
       " '好燃': 15,\n",
       " '表白': 124,\n",
       " '典型': 884,\n",
       " '方式': 1254,\n",
       " '每次': 638,\n",
       " '猜': 552,\n",
       " '没劲': 200,\n",
       " '诶': 264,\n",
       " '问': 421,\n",
       " '王牌': 84,\n",
       " '特工': 401,\n",
       " '那么': 6562,\n",
       " '杀人': 525,\n",
       " '经过': 190,\n",
       " '艺术': 694,\n",
       " '处理': 1053,\n",
       " '直接': 1101,\n",
       " '删': 122,\n",
       " '血腥': 583,\n",
       " '屠杀': 63,\n",
       " '赤裸裸': 113,\n",
       " '大段': 149,\n",
       " '正确': 300,\n",
       " '庇衣': 1,\n",
       " '意料之中': 88,\n",
       " '意料之外': 67,\n",
       " '惊喜': 1325,\n",
       " '属于': 748,\n",
       " '狼性': 4,\n",
       " '军魂': 2,\n",
       " '几个': 1927,\n",
       " '网红拉': 1,\n",
       " '弹弹琴': 2,\n",
       " '大国': 36,\n",
       " '气象': 12,\n",
       " '满屏': 79,\n",
       " 'tm': 344,\n",
       " '告诉': 936,\n",
       " '吴': 105,\n",
       " '迪塞尔': 58,\n",
       " '如入无人之境': 4,\n",
       " '亿': 113,\n",
       " '大陆': 716,\n",
       " '一刻': 195,\n",
       " '集体': 273,\n",
       " '勃起': 11,\n",
       " '离开': 469,\n",
       " '影厅': 26,\n",
       " '屌丝': 380,\n",
       " '同样': 867,\n",
       " '开始': 3351,\n",
       " '前': 2660,\n",
       " '屌': 347,\n",
       " ...}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = build_vocab(comments['cleaned_comment'])\n",
    "vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "评论总单词数：4558546；词汇表单词个数：139721\n"
     ]
    }
   ],
   "source": [
    "print(\"评论总单词数：{}；词汇表单词个数：{}\".format(sum(vocab.values()), len(vocab)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2index = {'unkown':0}\n",
    "for word,_ in vocab.items():\n",
    "    word2index[word] = len(word2index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'unkown': 0,\n",
       " '吴京': 1,\n",
       " '意淫': 2,\n",
       " '到': 3,\n",
       " '了': 4,\n",
       " '脑残': 5,\n",
       " '的': 6,\n",
       " '地步': 7,\n",
       " '看': 8,\n",
       " '恶心': 9,\n",
       " '想': 10,\n",
       " '吐': 11,\n",
       " '首映礼': 12,\n",
       " '太': 13,\n",
       " '恐怖': 14,\n",
       " '这个': 15,\n",
       " '电影': 16,\n",
       " '不讲道理': 17,\n",
       " '完全': 18,\n",
       " '就是': 19,\n",
       " '在': 20,\n",
       " '实现': 21,\n",
       " '他': 22,\n",
       " '小': 23,\n",
       " '粉红': 24,\n",
       " '英雄': 25,\n",
       " '梦': 26,\n",
       " '各种': 27,\n",
       " '装备': 28,\n",
       " '轮番': 29,\n",
       " '上场': 30,\n",
       " '视': 31,\n",
       " '物理': 32,\n",
       " '逻辑': 33,\n",
       " '于': 34,\n",
       " '不顾': 35,\n",
       " '不得不': 36,\n",
       " '说': 37,\n",
       " '有钱': 38,\n",
       " '真': 39,\n",
       " '好': 40,\n",
       " '随意': 41,\n",
       " '胡闹': 42,\n",
       " '炒作': 43,\n",
       " '水平': 44,\n",
       " '不输': 45,\n",
       " '冯小刚': 46,\n",
       " '但小刚': 47,\n",
       " '至少': 48,\n",
       " '不会': 49,\n",
       " '用': 50,\n",
       " '主旋律': 51,\n",
       " '来': 52,\n",
       " '让': 53,\n",
       " '人': 54,\n",
       " '不': 55,\n",
       " '舒服': 56,\n",
       " '为了': 57,\n",
       " '而': 58,\n",
       " '煽情': 59,\n",
       " '觉得': 60,\n",
       " '是': 61,\n",
       " '个': 62,\n",
       " '大': 63,\n",
       " '做作': 64,\n",
       " '谎言': 65,\n",
       " '家': 66,\n",
       " '7': 67,\n",
       " '29': 68,\n",
       " '更新': 69,\n",
       " '片子': 70,\n",
       " '整体': 71,\n",
       " '不如': 72,\n",
       " '湄公河': 73,\n",
       " '行动': 74,\n",
       " '1': 75,\n",
       " '不够': 76,\n",
       " '流畅': 77,\n",
       " '编剧': 78,\n",
       " '有毒': 79,\n",
       " '台词': 80,\n",
       " '尴尬': 81,\n",
       " '2': 82,\n",
       " '刻意': 83,\n",
       " '显得': 84,\n",
       " '如此': 85,\n",
       " '不合时宜': 86,\n",
       " '又': 87,\n",
       " '多余': 88,\n",
       " '凭良心说': 89,\n",
       " '看到': 90,\n",
       " '不像': 91,\n",
       " '战狼': 92,\n",
       " '续集': 93,\n",
       " '完虐': 94,\n",
       " '中二得': 95,\n",
       " '很': 96,\n",
       " '犯': 97,\n",
       " '我': 98,\n",
       " '中华': 99,\n",
       " '者': 100,\n",
       " '虽远必': 101,\n",
       " '诛': 102,\n",
       " '比': 103,\n",
       " '这句': 104,\n",
       " '话': 105,\n",
       " '还要': 106,\n",
       " '一百倍': 107,\n",
       " '脑子': 108,\n",
       " '东西': 109,\n",
       " '希望': 110,\n",
       " '们': 111,\n",
       " '都': 112,\n",
       " '能': 113,\n",
       " '有': 114,\n",
       " '三星': 115,\n",
       " '半': 116,\n",
       " '实打实': 117,\n",
       " '分': 118,\n",
       " '第一集': 119,\n",
       " '爱国': 120,\n",
       " '内部': 121,\n",
       " '做': 122,\n",
       " '着': 123,\n",
       " '置换': 124,\n",
       " '与': 125,\n",
       " '较劲': 126,\n",
       " '但': 127,\n",
       " '第二集': 128,\n",
       " '才': 129,\n",
       " '真正': 130,\n",
       " '显露': 131,\n",
       " '野心': 132,\n",
       " '终于': 133,\n",
       " '抛弃': 134,\n",
       " '李忠志': 135,\n",
       " '新增': 136,\n",
       " '外来': 137,\n",
       " '班底': 138,\n",
       " '硬件': 139,\n",
       " '实力': 140,\n",
       " '机会': 141,\n",
       " '和': 142,\n",
       " '国际': 143,\n",
       " '接轨': 144,\n",
       " '开篇': 145,\n",
       " '水下': 146,\n",
       " '长镜头': 147,\n",
       " '诸如': 148,\n",
       " '铁丝网': 149,\n",
       " '拦截': 150,\n",
       " 'rpg': 151,\n",
       " '弹头': 152,\n",
       " '细节': 153,\n",
       " '设计': 154,\n",
       " '国产': 155,\n",
       " '动作片': 156,\n",
       " '重新': 157,\n",
       " '封顶': 158,\n",
       " '理念': 159,\n",
       " '上': 160,\n",
       " '它': 161,\n",
       " '甚至': 162,\n",
       " '做到': 163,\n",
       " '绣春刀': 164,\n",
       " '最': 165,\n",
       " '想做到': 166,\n",
       " '那': 167,\n",
       " '部分': 168,\n",
       " '惊险': 169,\n",
       " '大气': 170,\n",
       " '引人入胜': 171,\n",
       " '结合': 172,\n",
       " '不俗': 173,\n",
       " '快': 174,\n",
       " '剪下': 175,\n",
       " '真刀真枪': 176,\n",
       " '不禁': 177,\n",
       " '热血沸腾': 178,\n",
       " '特别': 179,\n",
       " '弹簧床': 180,\n",
       " '架': 181,\n",
       " '挡': 182,\n",
       " '炸弹': 183,\n",
       " '空手': 184,\n",
       " '接': 185,\n",
       " '碎玻璃': 186,\n",
       " '弹匣': 187,\n",
       " '割喉': 188,\n",
       " '等': 189,\n",
       " '帅': 190,\n",
       " '得': 191,\n",
       " '飞起': 192,\n",
       " '就算': 193,\n",
       " '前半段': 194,\n",
       " '铺垫': 195,\n",
       " '节奏': 196,\n",
       " '散漫': 197,\n",
       " '主角': 198,\n",
       " '光环': 199,\n",
       " '开太大': 200,\n",
       " '也': 201,\n",
       " '不怕': 202,\n",
       " '作为': 203,\n",
       " '一个': 204,\n",
       " '中国': 205,\n",
       " '两个': 206,\n",
       " '小时': 207,\n",
       " '弥漫着': 208,\n",
       " '强大': 209,\n",
       " '不可': 210,\n",
       " '侵犯': 211,\n",
       " '氛围': 212,\n",
       " '还是': 213,\n",
       " '那颗': 214,\n",
       " '民族': 215,\n",
       " '自豪': 216,\n",
       " '心': 217,\n",
       " '砰砰': 218,\n",
       " '砰': 219,\n",
       " '跳个': 220,\n",
       " '不停': 221,\n",
       " '15': 222,\n",
       " '100': 223,\n",
       " '冷峰': 224,\n",
       " '这部': 225,\n",
       " '里': 226,\n",
       " '即': 227,\n",
       " '像': 228,\n",
       " '成龙': 229,\n",
       " '像杰': 230,\n",
       " '森斯坦': 231,\n",
       " '森': 232,\n",
       " '体制': 233,\n",
       " '外': 234,\n",
       " '同': 235,\n",
       " '类型': 236,\n",
       " '总是': 237,\n",
       " '代表': 238,\n",
       " '个人': 239,\n",
       " '无能': 240,\n",
       " '政府': 241,\n",
       " '需要': 242,\n",
       " '求助于': 243,\n",
       " '这些': 244,\n",
       " '才能': 245,\n",
       " '解决': 246,\n",
       " '难题': 247,\n",
       " '体现': 248,\n",
       " '价值': 249,\n",
       " '所以': 250,\n",
       " '照抄': 251,\n",
       " '这种': 252,\n",
       " '模式': 253,\n",
       " '实际上': 254,\n",
       " '问题': 255,\n",
       " '我们': 256,\n",
       " '以前': 257,\n",
       " '嘲笑': 258,\n",
       " '英雄主义': 259,\n",
       " '却': 260,\n",
       " '没想到': 261,\n",
       " '捆绑': 262,\n",
       " '爱国主义': 263,\n",
       " '全能': 264,\n",
       " '战士': 265,\n",
       " '更加': 266,\n",
       " '难以': 267,\n",
       " '下咽': 268,\n",
       " '多': 269,\n",
       " '无脑': 270,\n",
       " '信': 271,\n",
       " '戏': 272,\n",
       " '对': 273,\n",
       " '吴京路': 274,\n",
       " '转粉': 275,\n",
       " '最后': 276,\n",
       " '彩蛋': 277,\n",
       " '没有': 278,\n",
       " '理由': 279,\n",
       " '期待': 280,\n",
       " '下': 281,\n",
       " '一部': 282,\n",
       " '假': 283,\n",
       " '嗨': 284,\n",
       " '几处': 285,\n",
       " '情节': 286,\n",
       " '设置': 287,\n",
       " '过于': 288,\n",
       " '彰显': 289,\n",
       " '国家': 290,\n",
       " '自豪感': 291,\n",
       " '稍显': 292,\n",
       " '突兀': 293,\n",
       " '爽片': 294,\n",
       " '打戏': 295,\n",
       " '挺燃': 296,\n",
       " '但是': 297,\n",
       " '故事': 298,\n",
       " '一般': 299,\n",
       " '达康': 300,\n",
       " '书记': 301,\n",
       " '合适': 302,\n",
       " '角色': 303,\n",
       " '赵': 304,\n",
       " '东来': 305,\n",
       " '倒': 306,\n",
       " '张瀚': 307,\n",
       " '太太': 308,\n",
       " '太违': 309,\n",
       " '分钟': 310,\n",
       " '穿越': 311,\n",
       " '回': 312,\n",
       " '偶像剧': 313,\n",
       " '接到': 314,\n",
       " '非洲': 315,\n",
       " '卧底': 316,\n",
       " '冷锋': 317,\n",
       " '报告': 318,\n",
       " '丁义珍': 319,\n",
       " '现在': 320,\n",
       " '请求': 321,\n",
       " '抓捕': 322,\n",
       " '李达康': 323,\n",
       " '这件': 324,\n",
       " '事先': 325,\n",
       " '不要': 326,\n",
       " '声张': 327,\n",
       " '别': 328,\n",
       " '省厅': 329,\n",
       " '知道': 330,\n",
       " '就': 331,\n",
       " '你': 332,\n",
       " '一起': 333,\n",
       " '去': 334,\n",
       " '加上': 335,\n",
       " '同志': 336,\n",
       " '三人': 337,\n",
       " '逮捕': 338,\n",
       " '这次': 339,\n",
       " '行': 340,\n",
       " '叫': 341,\n",
       " '吧': 342,\n",
       " '拍': 343,\n",
       " '喜剧': 344,\n",
       " '整个': 345,\n",
       " '感觉': 346,\n",
       " '挺': 347,\n",
       " '搞笑': 348,\n",
       " '这么': 349,\n",
       " '打': 350,\n",
       " '过': 351,\n",
       " '徐晓冬': 352,\n",
       " '么': 353,\n",
       " '心往': 354,\n",
       " '一处': 355,\n",
       " '劲往': 356,\n",
       " '使': 357,\n",
       " '梦想': 358,\n",
       " '看吧': 359,\n",
       " '第一部': 360,\n",
       " '好太多': 361,\n",
       " '谢谢': 362,\n",
       " '美队': 363,\n",
       " '动作': 364,\n",
       " '指导': 365,\n",
       " '这': 366,\n",
       " '火': 367,\n",
       " '没见识': 368,\n",
       " '开头': 369,\n",
       " '长': 370,\n",
       " '对决': 371,\n",
       " '戏可算': 372,\n",
       " '华语': 373,\n",
       " '顶尖': 374,\n",
       " '存在': 375,\n",
       " '驱逐舰': 376,\n",
       " '导弹': 377,\n",
       " '坦克': 378,\n",
       " '商业片': 379,\n",
       " '狂用': 380,\n",
       " '镜头': 381,\n",
       " '运用': 382,\n",
       " '笑': 383,\n",
       " '点': 384,\n",
       " '插入': 385,\n",
       " '好莱坞': 386,\n",
       " '爆米花': 387,\n",
       " '不功': 388,\n",
       " '不过': 389,\n",
       " '从头': 390,\n",
       " '打到': 391,\n",
       " '尾': 392,\n",
       " '拼': 393,\n",
       " '虽然': 394,\n",
       " '有略': 395,\n",
       " '乱': 396,\n",
       " '时': 397,\n",
       " '因为': 398,\n",
       " '没': 399,\n",
       " '啥': 400,\n",
       " '期望值': 401,\n",
       " '被': 402,\n",
       " '吓了一跳': 403,\n",
       " '吴刚': 404,\n",
       " '谦和': 405,\n",
       " '丁海峰': 406,\n",
       " '老': 407,\n",
       " '三位': 408,\n",
       " '炖': 409,\n",
       " '烂熟': 410,\n",
       " '牛筋': 411,\n",
       " '嚼': 412,\n",
       " '用心': 413,\n",
       " '啊': 414,\n",
       " '导演': 415,\n",
       " '小看': 416,\n",
       " '确实': 417,\n",
       " '下功夫': 418,\n",
       " '拉': 419,\n",
       " '借鉴': 420,\n",
       " '至于': 421,\n",
       " '大家': 422,\n",
       " '比较': 423,\n",
       " '反感': 424,\n",
       " '情绪': 425,\n",
       " '那些': 426,\n",
       " '桥段': 427,\n",
       " '必备': 428,\n",
       " '稍微': 429,\n",
       " '一点': 430,\n",
       " '还': 431,\n",
       " '可以': 432,\n",
       " '接受': 433,\n",
       " '最好': 434,\n",
       " '地方': 435,\n",
       " '掌握': 436,\n",
       " '张弛': 437,\n",
       " '有度': 438,\n",
       " '这点': 439,\n",
       " '难得': 440,\n",
       " '一直': 441,\n",
       " '脑子里': 442,\n",
       " '回响': 443,\n",
       " '片头': 444,\n",
       " '海里': 445,\n",
       " '那场': 446,\n",
       " '戏看': 447,\n",
       " '完': 448,\n",
       " '呆': 449,\n",
       " '下去': 450,\n",
       " '太假': 451,\n",
       " '提前': 452,\n",
       " '离场': 453,\n",
       " '好看': 454,\n",
       " '演技': 455,\n",
       " '棒呆': 456,\n",
       " '符合': 457,\n",
       " '反而': 458,\n",
       " '更': 459,\n",
       " '差': 460,\n",
       " '这一': 461,\n",
       " '放之四海而皆准': 462,\n",
       " '规律': 463,\n",
       " '场面': 464,\n",
       " '越做越': 465,\n",
       " '然而': 466,\n",
       " '伴随': 467,\n",
       " '特效': 468,\n",
       " '升级': 469,\n",
       " '叙事': 470,\n",
       " '变得': 471,\n",
       " '非常': 472,\n",
       " '凌乱': 473,\n",
       " '格局': 474,\n",
       " '颇': 475,\n",
       " '拍成': 476,\n",
       " '黑鹰坠落': 477,\n",
       " '结果': 478,\n",
       " '撑死': 479,\n",
       " '最多': 480,\n",
       " '只是': 481,\n",
       " '官方': 482,\n",
       " '版': 483,\n",
       " '敢死队': 484,\n",
       " '但论': 485,\n",
       " '自我': 486,\n",
       " '角色定位': 487,\n",
       " '能力': 488,\n",
       " '远': 489,\n",
       " '如同': 490,\n",
       " '演员': 491,\n",
       " '出身': 492,\n",
       " '甄子丹': 493,\n",
       " '喜欢': 494,\n",
       " '不是': 495,\n",
       " '装傻': 496,\n",
       " '真傻': 497,\n",
       " '要不是': 498,\n",
       " '真的': 499,\n",
       " '别的': 500,\n",
       " '可': 501,\n",
       " '肯定': 502,\n",
       " '选': 503,\n",
       " '直男癌': 504,\n",
       " '令人发指': 505,\n",
       " '所有': 506,\n",
       " '剧情': 507,\n",
       " '走向': 508,\n",
       " '九十年代': 509,\n",
       " '那套': 510,\n",
       " '照搬': 511,\n",
       " '审美': 512,\n",
       " '事儿': 513,\n",
       " '一时半会儿': 514,\n",
       " '培养': 515,\n",
       " '出来': 516,\n",
       " '整部': 517,\n",
       " '延续': 518,\n",
       " '风格': 519,\n",
       " '热血': 520,\n",
       " '要': 521,\n",
       " '不错': 522,\n",
       " '适合': 523,\n",
       " '演': 524,\n",
       " '军人': 525,\n",
       " '之前': 526,\n",
       " '片段': 527,\n",
       " '念': 528,\n",
       " '劲儿': 529,\n",
       " '来说': 530,\n",
       " '张翰太违': 531,\n",
       " '一': 532,\n",
       " '一股': 533,\n",
       " '雷阵雨': 534,\n",
       " '画风': 535,\n",
       " '目瞪狗': 536,\n",
       " '瘠薄': 537,\n",
       " '人牛': 538,\n",
       " 'b': 539,\n",
       " '硬道理': 540,\n",
       " '隔壁': 541,\n",
       " '建军': 542,\n",
       " '大爷': 543,\n",
       " '你们': 544,\n",
       " '场景': 545,\n",
       " '战斗': 546,\n",
       " '全线': 547,\n",
       " '打斗': 548,\n",
       " '游走': 549,\n",
       " '审查': 550,\n",
       " '红线': 551,\n",
       " '边界': 552,\n",
       " '政治': 553,\n",
       " '安全': 554,\n",
       " '缝隙': 555,\n",
       " '部': 556,\n",
       " '极具': 557,\n",
       " '煽动': 558,\n",
       " '大片': 559,\n",
       " '制作': 560,\n",
       " '精良': 561,\n",
       " '影片': 562,\n",
       " '请': 563,\n",
       " '多来': 564,\n",
       " '胶卷': 565,\n",
       " '挺差': 566,\n",
       " '过度': 567,\n",
       " '部队': 568,\n",
       " '没太多': 569,\n",
       " '展示': 570,\n",
       " '死去': 571,\n",
       " '反正': 572,\n",
       " '吸引': 573,\n",
       " '冲': 574,\n",
       " '为什么': 575,\n",
       " '鄙视': 576,\n",
       " '敢': 577,\n",
       " '开拓': 578,\n",
       " '允许': 579,\n",
       " '他们': 580,\n",
       " '再': 581,\n",
       " '直到': 582,\n",
       " '更好': 583,\n",
       " '拍出': 584,\n",
       " '棒': 585,\n",
       " '出彩': 586,\n",
       " '呢': 587,\n",
       " '火爆': 588,\n",
       " '本片': 589,\n",
       " '必将': 590,\n",
       " '燃爆': 591,\n",
       " '暑期': 592,\n",
       " '厉害': 593,\n",
       " '身为': 594,\n",
       " '武打': 595,\n",
       " '高标准': 596,\n",
       " '枪战': 597,\n",
       " '为': 598,\n",
       " '点赞': 599,\n",
       " '热血男儿': 600,\n",
       " '荷尔蒙': 601,\n",
       " '爆发': 602,\n",
       " '给': 603,\n",
       " '0': 604,\n",
       " '星': 605,\n",
       " '血战': 606,\n",
       " '钢锯': 607,\n",
       " '岭': 608,\n",
       " '会': 609,\n",
       " '歌颂': 610,\n",
       " '宗教': 611,\n",
       " '情怀': 612,\n",
       " '超越': 613,\n",
       " '政权': 614,\n",
       " '当': 615,\n",
       " '只': 616,\n",
       " '明显': 617,\n",
       " '低': 618,\n",
       " '层次': 619,\n",
       " '充满': 620,\n",
       " '现实': 621,\n",
       " '乃至': 622,\n",
       " '投机': 623,\n",
       " '考量': 624,\n",
       " '高下': 625,\n",
       " '立': 626,\n",
       " '见': 627,\n",
       " '请问': 628,\n",
       " '吴京脑': 629,\n",
       " '残': 630,\n",
       " '火箭炮': 631,\n",
       " '吗': 632,\n",
       " '傲气': 633,\n",
       " '雄鹰': 634,\n",
       " '第一': 635,\n",
       " '滴血': 636,\n",
       " '4': 637,\n",
       " '算是': 638,\n",
       " '国内': 639,\n",
       " '片': 640,\n",
       " '准': 641,\n",
       " '钱': 642,\n",
       " '花': 643,\n",
       " '有效': 644,\n",
       " '气魄': 645,\n",
       " '创作': 646,\n",
       " '足够': 647,\n",
       " '真诚': 648,\n",
       " '人物': 649,\n",
       " '连': 650,\n",
       " '张翰': 651,\n",
       " '可爱': 652,\n",
       " '如果': 653,\n",
       " '当年': 654,\n",
       " '那样': 655,\n",
       " '一时': 656,\n",
       " '膨胀': 657,\n",
       " '银幕': 658,\n",
       " '独占': 659,\n",
       " '聚光灯': 660,\n",
       " '走': 661,\n",
       " '扪心自问': 662,\n",
       " '没法': 663,\n",
       " '评价': 664,\n",
       " '全片': 665,\n",
       " '靠': 666,\n",
       " '戏撑': 667,\n",
       " '文戏': 668,\n",
       " '扯淡': 669,\n",
       " '女主角': 670,\n",
       " '毫无': 671,\n",
       " '必要': 672,\n",
       " '只要': 673,\n",
       " '开挂': 674,\n",
       " '牛': 675,\n",
       " '逼': 676,\n",
       " '之处': 677,\n",
       " '在于': 678,\n",
       " '透露': 679,\n",
       " '极': 680,\n",
       " '强烈': 681,\n",
       " '意识形态': 682,\n",
       " '枷锁': 683,\n",
       " '祖国': 684,\n",
       " '面前': 685,\n",
       " '一切': 686,\n",
       " '反动派': 687,\n",
       " '纸老虎': 688,\n",
       " '人开': 689,\n",
       " '挂': 690,\n",
       " '团灭': 691,\n",
       " '合情合理': 692,\n",
       " '两星': 693,\n",
       " '鼓励': 694,\n",
       " '其他': 695,\n",
       " '一般般': 696,\n",
       " '看点': 697,\n",
       " '有点': 698,\n",
       " '手接': 699,\n",
       " '哈哈哈': 700,\n",
       " '从': 701,\n",
       " '之后': 702,\n",
       " '炸': 703,\n",
       " '翻': 704,\n",
       " '一下': 705,\n",
       " '四星': 706,\n",
       " '当时': 707,\n",
       " '其实': 708,\n",
       " '完成度': 709,\n",
       " '接近': 710,\n",
       " '每个': 711,\n",
       " '步骤': 712,\n",
       " '顺滑': 713,\n",
       " '任何': 714,\n",
       " '出人意料': 715,\n",
       " '是因为': 716,\n",
       " '看看': 717,\n",
       " '最近': 718,\n",
       " '世界': 719,\n",
       " '抱歉': 720,\n",
       " '影院': 721,\n",
       " '燃': 722,\n",
       " '起来': 723,\n",
       " '魔幻': 724,\n",
       " '当然': 725,\n",
       " '强拆': 726,\n",
       " '现实感': 727,\n",
       " '一幕': 728,\n",
       " '开场': 729,\n",
       " '6': 730,\n",
       " '搏斗': 731,\n",
       " '从来': 732,\n",
       " '其它': 733,\n",
       " '拍摄': 734,\n",
       " '难度': 735,\n",
       " '同时': 736,\n",
       " '技能': 737,\n",
       " '方面': 738,\n",
       " '要求': 739,\n",
       " '回来': 740,\n",
       " '搜': 741,\n",
       " '吴京会': 742,\n",
       " '游泳': 743,\n",
       " '潜水': 744,\n",
       " '滑雪': 745,\n",
       " '开': 746,\n",
       " '飞机': 747,\n",
       " '射击': 748,\n",
       " '各项': 749,\n",
       " '特意': 750,\n",
       " '特种部队': 751,\n",
       " '当过': 752,\n",
       " '18': 753,\n",
       " '月': 754,\n",
       " '兵': 755,\n",
       " '佩服': 756,\n",
       " '这样': 757,\n",
       " '3': 758,\n",
       " '星半': 759,\n",
       " '结束': 760,\n",
       " '掌声': 761,\n",
       " '出现': 762,\n",
       " '近期': 763,\n",
       " '少见': 764,\n",
       " '一粒': 765,\n",
       " '大补丸': 766,\n",
       " '有人': 767,\n",
       " '吃': 768,\n",
       " '开心': 769,\n",
       " '补大': 770,\n",
       " '从白': 771,\n",
       " '黑': 772,\n",
       " '字幕': 773,\n",
       " '展现': 774,\n",
       " '超级': 775,\n",
       " '直': 776,\n",
       " '男': 777,\n",
       " '糙': 778,\n",
       " '猛': 779,\n",
       " '媲美': 780,\n",
       " '终结者': 781,\n",
       " '5': 782,\n",
       " '无亮点': 783,\n",
       " '张翰变': 784,\n",
       " '谐星': 785,\n",
       " '3d': 786,\n",
       " '掌控': 787,\n",
       " '逼近': 788,\n",
       " 'hold': 789,\n",
       " '不住': 790,\n",
       " '边缘': 791,\n",
       " '带感': 792,\n",
       " '拳拳': 793,\n",
       " '肉': 794,\n",
       " '超爽': 795,\n",
       " '聪明': 796,\n",
       " '鸡': 797,\n",
       " '贼': 798,\n",
       " '一面': 799,\n",
       " '旗下': 800,\n",
       " '呈现': 801,\n",
       " '一出': 802,\n",
       " '重工业': 803,\n",
       " '娱乐': 804,\n",
       " '调控': 805,\n",
       " '说教': 806,\n",
       " '比例': 807,\n",
       " '尺度': 808,\n",
       " '大众': 809,\n",
       " '接纳': 810,\n",
       " '把握': 811,\n",
       " '微妙': 812,\n",
       " '其中': 813,\n",
       " '一些': 814,\n",
       " '奇侠': 815,\n",
       " '化': 816,\n",
       " '内容': 817,\n",
       " '比如': 818,\n",
       " '玻璃碴': 819,\n",
       " '子当': 820,\n",
       " '飞镖': 821,\n",
       " '杀敌': 822,\n",
       " '一类': 823,\n",
       " '只不过': 824,\n",
       " '遮盖': 825,\n",
       " '掉': 826,\n",
       " '老爹': 827,\n",
       " '演过': 828,\n",
       " '美剧': 829,\n",
       " '搏击': 830,\n",
       " '王国': 831,\n",
       " '力荐': 832,\n",
       " '那部': 833,\n",
       " '为啥': 834,\n",
       " '奇异': 835,\n",
       " '恩典': 836,\n",
       " '配乐': 837,\n",
       " '画内': 838,\n",
       " '男生': 839,\n",
       " '的话': 840,\n",
       " '应该': 841,\n",
       " '刺激': 842,\n",
       " '肾上腺素': 843,\n",
       " '女生': 844,\n",
       " '对龙小云': 845,\n",
       " '感情': 846,\n",
       " '十分': 847,\n",
       " '打动': 848,\n",
       " '模仿': 849,\n",
       " '许多': 850,\n",
       " '怎么': 851,\n",
       " '玩': 852,\n",
       " '一股脑': 853,\n",
       " '堆': 854,\n",
       " '槽': 855,\n",
       " '几位': 856,\n",
       " '血厚到': 857,\n",
       " '科幻': 858,\n",
       " '级别': 859,\n",
       " '重复': 860,\n",
       " '满血': 861,\n",
       " '红血': 862,\n",
       " '中毒': 863,\n",
       " '极速': 864,\n",
       " '回血': 865,\n",
       " '爆种': 866,\n",
       " '打通': 867,\n",
       " '全场': 868,\n",
       " '太过': 869,\n",
       " '投机取巧': 870,\n",
       " '穿': 871,\n",
       " '迈克尔': 872,\n",
       " '贝都': 873,\n",
       " '不受': 874,\n",
       " '待见': 875,\n",
       " '国片': 876,\n",
       " '前仆后继': 877,\n",
       " '爆炸': 878,\n",
       " 'high': 879,\n",
       " '瞎燃': 880,\n",
       " '没用': 881,\n",
       " '10': 882,\n",
       " '女人': 883,\n",
       " '缺': 884,\n",
       " '男人': 885,\n",
       " '征服': 886,\n",
       " '吴京直': 887,\n",
       " '男癌': 888,\n",
       " '🇨': 889,\n",
       " '🇳': 890,\n",
       " '美国': 891,\n",
       " '不行': 892,\n",
       " '死': 893,\n",
       " '全都': 894,\n",
       " '跳': 895,\n",
       " '跟': 896,\n",
       " '跳墙': 897,\n",
       " '一样': 898,\n",
       " '拯救': 899,\n",
       " '国产片': 900,\n",
       " '以': 901,\n",
       " '中印': 902,\n",
       " '局势': 903,\n",
       " '对比': 904,\n",
       " '假想': 905,\n",
       " '真是': 906,\n",
       " '讽刺': 907,\n",
       " '谄媚': 908,\n",
       " '军旅': 909,\n",
       " '题材': 910,\n",
       " '质感': 911,\n",
       " '燃到': 912,\n",
       " '国外': 913,\n",
       " '精彩': 914,\n",
       " '看着': 915,\n",
       " '有力': 916,\n",
       " '必须': 917,\n",
       " '安利': 918,\n",
       " '一下张': 919,\n",
       " '翰': 920,\n",
       " '简直': 921,\n",
       " '承包': 922,\n",
       " '笑点': 923,\n",
       " '量身定做': 924,\n",
       " '彭于': 925,\n",
       " '晏': 926,\n",
       " '可演': 927,\n",
       " '不来': 928,\n",
       " '不少': 929,\n",
       " '漂移': 930,\n",
       " '无人机': 931,\n",
       " '突袭': 932,\n",
       " '直升机': 933,\n",
       " '坠露': 934,\n",
       " '肉搏': 935,\n",
       " '军舰': 936,\n",
       " '发射': 937,\n",
       " '叛乱': 938,\n",
       " '国际化': 939,\n",
       " '视角': 940,\n",
       " '标配': 941,\n",
       " '饰演': 942,\n",
       " '深入人心': 943,\n",
       " '搏命': 944,\n",
       " '精神': 945,\n",
       " '当下': 946,\n",
       " '第三部': 947,\n",
       " '好燃': 948,\n",
       " '表白': 949,\n",
       " '典型': 950,\n",
       " '方式': 951,\n",
       " '每次': 952,\n",
       " '猜': 953,\n",
       " '没劲': 954,\n",
       " '诶': 955,\n",
       " '问': 956,\n",
       " '王牌': 957,\n",
       " '特工': 958,\n",
       " '那么': 959,\n",
       " '杀人': 960,\n",
       " '经过': 961,\n",
       " '艺术': 962,\n",
       " '处理': 963,\n",
       " '直接': 964,\n",
       " '删': 965,\n",
       " '血腥': 966,\n",
       " '屠杀': 967,\n",
       " '赤裸裸': 968,\n",
       " '大段': 969,\n",
       " '正确': 970,\n",
       " '庇衣': 971,\n",
       " '意料之中': 972,\n",
       " '意料之外': 973,\n",
       " '惊喜': 974,\n",
       " '属于': 975,\n",
       " '狼性': 976,\n",
       " '军魂': 977,\n",
       " '几个': 978,\n",
       " '网红拉': 979,\n",
       " '弹弹琴': 980,\n",
       " '大国': 981,\n",
       " '气象': 982,\n",
       " '满屏': 983,\n",
       " 'tm': 984,\n",
       " '告诉': 985,\n",
       " '吴': 986,\n",
       " '迪塞尔': 987,\n",
       " '如入无人之境': 988,\n",
       " '亿': 989,\n",
       " '大陆': 990,\n",
       " '一刻': 991,\n",
       " '集体': 992,\n",
       " '勃起': 993,\n",
       " '离开': 994,\n",
       " '影厅': 995,\n",
       " '屌丝': 996,\n",
       " '同样': 997,\n",
       " '开始': 998,\n",
       " '前': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 评论中没有词向量中的词\n",
    "- 解决措施：继续分词\n",
    "- 拼写纠错：如“鸡冻”--> “激动”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_coverage(vocab, wv):\n",
    "    known_words = {}\n",
    "    unknown_words = {}\n",
    "    no_known_words = 0\n",
    "    no_unknown_words = 0\n",
    "    for word in vocab:\n",
    "        try:\n",
    "            known_words[word] = wv[word]\n",
    "            no_known_words += vocab[word]\n",
    "        except:\n",
    "            unknown_words[word] = vocab[word]\n",
    "            no_unknown_words += vocab[word]\n",
    "    print('词汇表中 {:.2%} 的单词有词向量'.format(len(known_words) / len(vocab)))\n",
    "    print('评论的所有单词中 {:.2%} 的单词有词向量'.format(\n",
    "        no_known_words / (no_known_words + no_unknown_words)))\n",
    "    unknown_words = sorted(unknown_words.items(),\n",
    "                           key=operator.itemgetter(1))[::-1]\n",
    "    return unknown_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词汇表中 69.33% 的单词有词向量\n",
      "评论的所有单词中 97.05% 的单词有词向量\n"
     ]
    }
   ],
   "source": [
    "unknown_words = check_coverage(vocab, wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42851"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(unknown_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhotdwarvesatyourservice',\n",
       "  1),\n",
       " ('blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah',\n",
       "  1),\n",
       " ('hiusjufbjijhjjsfightdjsiosnwnbombsisjwnexplosionwhshbdbwwhatfuckjusthappenddhsjsndhdjfighrekizjekilldbiejsshotdhsjsnthatbitchsjdjjffuckjejsh',\n",
       "  1),\n",
       " ('bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbest',\n",
       "  2),\n",
       " ('bce43fd38b93b6c518d056245a8a4de28880c502e41eebc1978142bcb20fadc9bc54d7cbb6a0',\n",
       "  1),\n",
       " ('prprprprprprprprprprprprprprprprprprprprpr', 2),\n",
       " ('97db28f635ac65285bac7790ca7a36ca2234679201', 1),\n",
       " ('3a3e3c800a74582d2295df84c6624848bd6ed54a', 2),\n",
       " ('4ee833da28c2d1e1b7b377769038f2a699eb7394', 2),\n",
       " ('a731364e7b392af2f5b7bc897328cba450f9f95d', 2),\n",
       " ('1dfd74a56054b289a4a244630d7ed1e70687f21b', 2),\n",
       " ('9aa786a606eee1f6a26c3411a67731de2dd7c877', 2),\n",
       " ('d9dea7ecb35d8b2d1c4b9141f7d8a18a4f24c559', 2),\n",
       " ('e022283c58bf58aa733e61d2977ca60e513a0e14', 1),\n",
       " ('5438712a5b1db87f0e5e69d9f1fe21e023864e14', 1),\n",
       " ('251f2c4a8392d2c4c73d7798d666c65690bfe1af', 1),\n",
       " ('7f3467e5f6ba2b866c1ef7029a113db4c33311dc', 1),\n",
       " ('523452693ed265f9de29558b4cfc1960c895b24a', 1),\n",
       " ('a5f5bf93f4a1fcfcc688b2b3255e0a20958751f2', 1),\n",
       " ('d6b6d9084ddc8542b188f6681d1c085f51dfb7e1', 1),\n",
       " ('dw5pb25fawq9mtaymjezxzewmdawml8wmv8wmq', 1),\n",
       " ('71a5640b2f22570da2243cd6711589fb', 1),\n",
       " ('6447eb7002ef7802286a08e7a47203fa', 1),\n",
       " ('222222222222222222222222222222', 1),\n",
       " ('ze0x6rsngqcnvaimpbjfubxdrrttrg', 1),\n",
       " ('hahahahahahahahahaahahahahahah', 1),\n",
       " ('blablablablablablablablablabla', 1),\n",
       " ('shelookslikeayoungmonicavitti', 1),\n",
       " ('woooooooooooooooooooooooooow', 2),\n",
       " ('jhjhkjkjhhgjrdswjkmkljijijme', 1),\n",
       " ('bqgp1zuouiykw4mx0mclymm5q', 1),\n",
       " ('lannnnnnnnnnnnnnnnnnnnnn', 1),\n",
       " ('0427bgm8fowuak80jbudny3n', 1),\n",
       " ('649595e086366a104da0f09b', 1),\n",
       " ('f82708b1de47a24adf0f499c', 1),\n",
       " ('lowlowlowlowlowlowlowlow', 1),\n",
       " ('iookkgsyuwqgduxsacxklaj', 1),\n",
       " ('mirrennnnnnnnnnnnnnnnnn', 1),\n",
       " ('ttttttttttttttttttttttt', 1),\n",
       " ('d6pohvfblei7h10ndve7zq', 1),\n",
       " ('makeyourselfaguardian', 1),\n",
       " ('caonimagewangbagaozi', 4),\n",
       " ('freeeeeeeeeeeeeeedom', 2),\n",
       " ('benzbenzbenzbenzbenz', 1),\n",
       " ('gainsbourgcharlotte', 3),\n",
       " ('chitatatachitachita', 2),\n",
       " ('reaaaaaaaaaaaaaally', 1),\n",
       " ('hahahahahahahahahah', 1),\n",
       " ('nnnnnnnnnnnnnnnnnnn', 1),\n",
       " ('jtrqagj6qqddovdsaf9', 1)]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 50 unknown entries with the longest word text, longest first.\n",
    "# Sorting on the negated length keeps ties in their existing order.\n",
    "sorted(unknown_words, key=lambda entry: -len(entry[0]))[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('2', 3789),\n",
       " ('3', 3620),\n",
       " ('1', 3510),\n",
       " ('5', 3353),\n",
       " ('4', 2237),\n",
       " ('7', 1719),\n",
       " ('8', 1516),\n",
       " ('10', 1476),\n",
       " ('6', 1216),\n",
       " ('3d', 1055),\n",
       " ('0', 919),\n",
       " ('9', 822),\n",
       " ('20', 660),\n",
       " ('12', 535),\n",
       " ('90', 501),\n",
       " ('30', 445),\n",
       " ('加一星', 443),\n",
       " ('11', 432),\n",
       " ('星给', 420),\n",
       " ('80', 413),\n",
       " ('一般般', 410),\n",
       " ('⋯', 386),\n",
       " ('15', 341),\n",
       " ('100', 340),\n",
       " ('007', 322),\n",
       " ('13', 315),\n",
       " ('50', 272),\n",
       " ('2016', 269),\n",
       " ('2008', 268),\n",
       " ('2012', 260),\n",
       " ('2015', 249),\n",
       " ('2014', 242),\n",
       " ('2013', 239),\n",
       " ('14', 236),\n",
       " ('’', 233),\n",
       " ('18', 228),\n",
       " ('17', 226),\n",
       " ('60', 223),\n",
       " ('cctv6', 216),\n",
       " ('21', 213),\n",
       " ('2010', 210),\n",
       " ('太赞', 209),\n",
       " ('╯', 205),\n",
       " ('2017', 205),\n",
       " ('70', 204),\n",
       " ('￣', 203),\n",
       " ('‘', 202),\n",
       " ('2009', 201),\n",
       " ('没劲', 200),\n",
       " ('╰', 198)]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 50 unknown entries with the highest counts, highest first\n",
    "sorted(unknown_words, key=operator.itemgetter(1), reverse=True)[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: does wv contain the token '2', the entry with the highest\n",
    "# count in the unknown-word list?\n",
    "'2' in wv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 没有词向量的汉语短语"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_chinese(str):\n",
    "    \"\"\"Report whether ``str`` contains at least one CJK Unified Ideograph.\n",
    "\n",
    "    A character counts as Chinese when it lies in the range U+4E00..U+9FFF.\n",
    "    One such character anywhere in the string is enough; an empty string\n",
    "    gives False.\n",
    "    \"\"\"\n",
    "    return any(u'\\u4e00' <= ch <= u'\\u9fff' for ch in str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check of is_chinese on an all-Chinese phrase\n",
    "is_chinese('君子之交淡如水')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37180"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the unknown entries whose word passes is_chinese, then count them\n",
    "unknown_chinese = [\n",
    "    (word, count)\n",
    "    for word, count in unknown_words\n",
    "    if is_chinese(word)\n",
    "]\n",
    "len(unknown_chinese)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('九百六十万平方公里', 1),\n",
       " ('丈二和尚摸不着头脑', 1),\n",
       " ('穷人的孩子早当家', 1),\n",
       " ('百尺竿头更进一步', 1),\n",
       " ('八仙过海各显神通', 1),\n",
       " ('君子之交淡如水', 10),\n",
       " ('树欲静而风不止', 7),\n",
       " ('竹篮打水一场空', 7),\n",
       " ('天生我材必有用', 4),\n",
       " ('天下乌鸦一般黑', 3)]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 10 unknown Chinese entries with the longest word text, longest first.\n",
    "# Sorting on the negated length keeps ties in their existing order.\n",
    "sorted(unknown_chinese, key=lambda entry: -len(entry[0]))[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('加一星', 443),\n",
       " ('星给', 420),\n",
       " ('一般般', 410),\n",
       " ('太赞', 209),\n",
       " ('没劲', 200),\n",
       " ('带感', 177),\n",
       " ('尿点', 155),\n",
       " ('脑残粉', 152),\n",
       " ('残粉', 146),\n",
       " ('看得人', 146)]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 10 unknown Chinese entries with the highest counts, highest first\n",
    "sorted(unknown_chinese, key=operator.itemgetter(1), reverse=True)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 没有词向量的汉语短语，前向最大匹配继续分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cut(text, vocab=None):\n",
    "    \"\"\"Segment ``text`` by forward maximum matching against a vocabulary.\n",
    "\n",
    "    At each position the longest prefix of the remaining text that is found\n",
    "    in the vocabulary is emitted as a word. When no prefix matches, not even\n",
    "    the single leading character, that character is dropped and matching\n",
    "    resumes after it.\n",
    "\n",
    "    Args:\n",
    "        text: The string to segment.\n",
    "        vocab: Any container supporting ``in`` for strings. Defaults to the\n",
    "            notebook-level ``wv`` object.\n",
    "\n",
    "    Returns:\n",
    "        The matched words as a list, in order of appearance. Characters with\n",
    "        no vocabulary entry are not included.\n",
    "    \"\"\"\n",
    "    if vocab is None:\n",
    "        vocab = wv\n",
    "    words = []\n",
    "    while text:\n",
    "        # Try the longest candidate first and shrink one character at a time.\n",
    "        for j in range(len(text), 0, -1):\n",
    "            w = text[:j]\n",
    "            if w in vocab:\n",
    "                words.append(w)\n",
    "                text = text[j:]\n",
    "                break\n",
    "        else:\n",
    "            # Nothing matched. Always consume one character, whitespace\n",
    "            # included; otherwise an unmatched whitespace character would\n",
    "            # keep ``text`` non-empty and the outer loop would never end.\n",
    "            text = text[1:]\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['九百', '六十万', '平方公里']\n",
      "['丈二', '和尚', '摸不着头脑']\n",
      "['穷人', '的', '孩子', '早', '当家']\n",
      "['百尺竿头', '更进一步']\n",
      "['八仙过海', '各显神通']\n",
      "['君子之交', '淡如水']\n",
      "['树', '欲', '静', '而风', '不止']\n",
      "['竹篮', '打水', '一场空']\n",
      "['天生', '我材', '必有用']\n",
      "['天下', '乌鸦', '一般', '黑']\n"
     ]
    }
   ],
   "source": [
    "# Re-segment the ten longest unknown Chinese phrases with cut()\n",
    "for w, _ in sorted(unknown_chinese,\n",
    "                   key=lambda entry: -len(entry[0]))[:10]:\n",
    "    print(cut(w))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['加一', '星']\n",
      "['星', '给']\n",
      "['一般', '般']\n",
      "['太', '赞']\n",
      "['没', '劲']\n",
      "['带', '感']\n",
      "['尿', '点']\n",
      "['脑残', '粉']\n",
      "['残', '粉']\n",
      "['看得', '人']\n"
     ]
    }
   ],
   "source": [
    "# Re-segment the ten unknown Chinese phrases with the highest counts\n",
    "for w, _ in sorted(unknown_chinese, key=operator.itemgetter(1),\n",
    "                   reverse=True)[:10]:\n",
    "    print(cut(w))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 全英文字符短语"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_string(str):\n",
    "    \"\"\"Report whether every character of ``str`` is an ASCII lowercase letter.\n",
    "\n",
    "    Digits, uppercase letters, punctuation and non-ASCII characters all make\n",
    "    the result False. An empty string yields True.\n",
    "    \"\"\"\n",
    "    return all(ch in string.ascii_lowercase for ch in str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2306"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the unknown entries whose word passes is_string, then count them\n",
    "unknown_string = [\n",
    "    (word, count)\n",
    "    for word, count in unknown_words\n",
    "    if is_string(word)\n",
    "]\n",
    "len(unknown_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('dvdrip', 70),\n",
       " ('bdrip', 18),\n",
       " ('ccav', 17),\n",
       " ('bjiff', 15),\n",
       " ('hitchitsch', 15),\n",
       " ('btih', 15),\n",
       " ('johnnydepp', 14),\n",
       " ('jlo', 13),\n",
       " ('xxoo', 13),\n",
       " ('nnd', 13),\n",
       " ('willsmith', 12),\n",
       " ('xmen', 12),\n",
       " ('yyets', 12),\n",
       " ('mdzz', 12),\n",
       " ('kevinspacey', 10),\n",
       " ('exm', 10),\n",
       " ('clinteastwood', 10),\n",
       " ('mlgb', 10),\n",
       " ('happyending', 9),\n",
       " ('blabla', 9),\n",
       " ('prpr', 9),\n",
       " ('qwq', 9),\n",
       " ('tomhanks', 9),\n",
       " ('megryan', 8),\n",
       " ('cinematheque', 8),\n",
       " ('xddd', 8),\n",
       " ('jimcarrey', 8),\n",
       " ('dxy', 7),\n",
       " ('rylance', 7),\n",
       " ('woodyallen', 7),\n",
       " ('bluesliver', 7),\n",
       " ('robertdeniro', 7),\n",
       " ('congroo', 7),\n",
       " ('gscas', 7),\n",
       " ('jasonstatham', 7),\n",
       " ('quq', 7),\n",
       " ('balmes', 7),\n",
       " ('undatable', 6),\n",
       " ('undateable', 6),\n",
       " ('depressing', 6),\n",
       " ('kyxq', 6),\n",
       " ('amiable', 6),\n",
       " ('bingbong', 6),\n",
       " ('clotaire', 6),\n",
       " ('angelinajolie', 6),\n",
       " ('hughjackman', 6),\n",
       " ('qmdb', 6),\n",
       " ('timburton', 6),\n",
       " ('prevarticle', 6),\n",
       " ('maggieq', 6),\n",
       " ('nicolascage', 6),\n",
       " ('tnnd', 6),\n",
       " ('piapiapia', 6),\n",
       " ('vikander', 6),\n",
       " ('hanmeimei', 6),\n",
       " ('panzerlied', 5),\n",
       " ('agyness', 5),\n",
       " ('almodovar', 5),\n",
       " ('sonakshi', 5),\n",
       " ('lindsaylohan', 5),\n",
       " ('coens', 5),\n",
       " ('juliaroberts', 5),\n",
       " ('hahahaha', 5),\n",
       " ('deniro', 5),\n",
       " ('halfcd', 5),\n",
       " ('moonriver', 5),\n",
       " ('wlgc', 5),\n",
       " ('sharlto', 5),\n",
       " ('blablabla', 5),\n",
       " ('qvq', 5),\n",
       " ('annehathaway', 5),\n",
       " ('ppps', 5),\n",
       " ('mediocre', 5),\n",
       " ('cucurrucucu', 4),\n",
       " ('makus', 4),\n",
       " ('kevincostner', 4),\n",
       " ('pissed', 4),\n",
       " ('garrn', 4),\n",
       " ('paulnewman', 4),\n",
       " ('isrenhe', 4),\n",
       " ('danieldaylewis', 4),\n",
       " ('aibileen', 4),\n",
       " ('pathetic', 4),\n",
       " ('bugger', 4),\n",
       " ('xunlei', 4),\n",
       " ('eits', 4),\n",
       " ('tspdt', 4),\n",
       " ('claflin', 4),\n",
       " ('sucked', 4),\n",
       " ('reservoirbuns', 4),\n",
       " ('angryalien', 4),\n",
       " ('harrisonford', 4),\n",
       " ('alpacino', 4),\n",
       " ('imbt', 4),\n",
       " ('wqnmlgb', 4),\n",
       " ('doulist', 4),\n",
       " ('ttatt', 4),\n",
       " ('yunpan', 4),\n",
       " ('tomcruise', 4),\n",
       " ('caonimagewangbagaozi', 4),\n",
       " ('vindiesel', 4),\n",
       " ('hiahiahia', 4),\n",
       " ('blahblah', 4),\n",
       " ('weixin', 4),\n",
       " ('sociopath', 4),\n",
       " ('wwwww', 4),\n",
       " ('uptobox', 4),\n",
       " ('himym', 4),\n",
       " ('stevenspielberg', 4),\n",
       " ('biubiubiu', 4),\n",
       " ('bingka', 4),\n",
       " ('happyness', 4),\n",
       " ('stroszek', 3),\n",
       " ('sver', 3),\n",
       " ('fvck', 3),\n",
       " ('milland', 3),\n",
       " ('perverted', 3),\n",
       " ('queenb', 3),\n",
       " ('pauldano', 3),\n",
       " ('piaohua', 3),\n",
       " ('documentaire', 3),\n",
       " ('banal', 3),\n",
       " ('couscous', 3),\n",
       " ('meilleure', 3),\n",
       " ('grym', 3),\n",
       " ('gainsbourgcharlotte', 3),\n",
       " ('johncusack', 3),\n",
       " ('pretentious', 3),\n",
       " ('ctrlhd', 3),\n",
       " ('dualaudio', 3),\n",
       " ('fasten', 3),\n",
       " ('bdiso', 3),\n",
       " ('dardennes', 3),\n",
       " ('taphore', 3),\n",
       " ('merylstreep', 3),\n",
       " ('filmoteca', 3),\n",
       " ('chbosky', 3),\n",
       " ('psychos', 3),\n",
       " ('wonderfully', 3),\n",
       " ('bitterness', 3),\n",
       " ('subtlety', 3),\n",
       " ('hughgrant', 3),\n",
       " ('cnxp', 3),\n",
       " ('bingbang', 3),\n",
       " ('magnifique', 3),\n",
       " ('yculblog', 3),\n",
       " ('cmct', 3),\n",
       " ('denzelwashington', 3),\n",
       " ('everthing', 3),\n",
       " ('ttutt', 3),\n",
       " ('evagreen', 3),\n",
       " ('johntravolta', 3),\n",
       " ('edwardnorton', 3),\n",
       " ('bradpitt', 3),\n",
       " ('xiami', 3),\n",
       " ('funnier', 3),\n",
       " ('sayin', 3),\n",
       " ('tbbt', 3),\n",
       " ('hollyfood', 3),\n",
       " ('tttattt', 3),\n",
       " ('tieba', 3),\n",
       " ('anglebaby', 3),\n",
       " ('dustinhoffman', 3),\n",
       " ('symbolis', 3),\n",
       " ('georgeclooney', 3),\n",
       " ('luxiang', 3),\n",
       " ('simonpegg', 3),\n",
       " ('youself', 3),\n",
       " ('albumplay', 3),\n",
       " ('prprpr', 3),\n",
       " ('duangduangduang', 3),\n",
       " ('judelaw', 3),\n",
       " ('hassells', 3),\n",
       " ('zhihu', 3),\n",
       " ('hkaff', 3),\n",
       " ('urself', 3),\n",
       " ('dennings', 3),\n",
       " ('hxm', 3),\n",
       " ('saysaysay', 3),\n",
       " ('sarabandi', 3),\n",
       " ('fereshte', 3),\n",
       " ('seddiqi', 3),\n",
       " ('bahare', 3),\n",
       " ('farrokh', 3),\n",
       " ('katewinslet', 3),\n",
       " ('mcmurphy', 3),\n",
       " ('cooooool', 3),\n",
       " ('jasonbourne', 3),\n",
       " ('buleliongb', 3),\n",
       " ('sooooo', 3),\n",
       " ('unbelieveble', 2),\n",
       " ('surtout', 2),\n",
       " ('xiaomei', 2),\n",
       " ('bizzar', 2),\n",
       " ('photog', 2),\n",
       " ('filmlinc', 2),\n",
       " ('haaappy', 2),\n",
       " ('tomodaji', 2),\n",
       " ('douna', 2),\n",
       " ('escapar', 2),\n",
       " ('pudiera', 2),\n",
       " ('afecta', 2),\n",
       " ('harshness', 2),\n",
       " ('dysfz', 2),\n",
       " ('louka', 2),\n",
       " ('relatability', 2),\n",
       " ('thanasewee', 2),\n",
       " ('chantawit', 2),\n",
       " ('helloivan', 2),\n",
       " ('leyendecker', 2),\n",
       " ('hipohop', 2),\n",
       " ('aaroneckhart', 2),\n",
       " ('lauralinney', 2),\n",
       " ('posent', 2),\n",
       " ('suikie', 2),\n",
       " ('looooooooooooooove', 2),\n",
       " ('fantasitic', 2),\n",
       " ('leoscarax', 2),\n",
       " ('uxxxx', 2),\n",
       " ('isetta', 2),\n",
       " ('traceyullman', 2),\n",
       " ('eeer', 2),\n",
       " ('orsonwelles', 2),\n",
       " ('rosemaryai', 2),\n",
       " ('shenyuegk', 2),\n",
       " ('adrek', 2),\n",
       " ('imagineers', 2),\n",
       " ('nerders', 2),\n",
       " ('sifi', 2),\n",
       " ('bbbbbbbbbbbbbbbb', 2),\n",
       " ('suxi', 2),\n",
       " ('worest', 2),\n",
       " ('xinxin', 2),\n",
       " ('aamzing', 2),\n",
       " ('davidmamet', 2),\n",
       " ('morricon', 2),\n",
       " ('freewheeling', 2),\n",
       " ('stealthily', 2),\n",
       " ('drehbuch', 2),\n",
       " ('gutes', 2),\n",
       " ('xmtmxntuwnzi', 2),\n",
       " ('zhenhan', 2),\n",
       " ('wastwood', 2),\n",
       " ('bashers', 2),\n",
       " ('oharu', 2),\n",
       " ('bedeviled', 2),\n",
       " ('newspaperman', 2),\n",
       " ('scumbag', 2),\n",
       " ('sensationnalisme', 2),\n",
       " ('hypocrisie', 2),\n",
       " ('visionnaire', 2),\n",
       " ('cynisme', 2),\n",
       " ('immoralit', 2),\n",
       " ('manipulateur', 2),\n",
       " ('diatique', 2),\n",
       " ('ardemment', 2),\n",
       " ('prenant', 2),\n",
       " ('blogbus', 2),\n",
       " ('characterizations', 2),\n",
       " ('foriegn', 2),\n",
       " ('arimura', 2),\n",
       " ('cillianmurphy', 2),\n",
       " ('jwj', 2),\n",
       " ('yidia', 2),\n",
       " ('yidian', 2),\n",
       " ('karner', 2),\n",
       " ('nerdz', 2),\n",
       " ('gaydar', 2),\n",
       " ('duplass', 2),\n",
       " ('interconnected', 2),\n",
       " ('lobud', 2),\n",
       " ('chitatatachitachita', 2),\n",
       " ('sharina', 2),\n",
       " ('ezellweger', 2),\n",
       " ('alikhan', 2),\n",
       " ('hqc', 2),\n",
       " ('demonii', 2),\n",
       " ('openbittorrent', 2),\n",
       " ('jjlin', 2),\n",
       " ('sincity', 2),\n",
       " ('dumsday', 2),\n",
       " ('balbalba', 2),\n",
       " ('sallu', 2),\n",
       " ('dickhead', 2),\n",
       " ('twatt', 2),\n",
       " ('untruth', 2),\n",
       " ('testicles', 2),\n",
       " ('ronk', 2),\n",
       " ('llewy', 2),\n",
       " ('effortless', 2),\n",
       " ('malheure', 2),\n",
       " ('mazel', 2),\n",
       " ('cazele', 2),\n",
       " ('cazale', 2),\n",
       " ('licat', 2),\n",
       " ('pukka', 2),\n",
       " ('sooooooooooo', 2),\n",
       " ('pulpfiction', 2),\n",
       " ('benchetrit', 2),\n",
       " ('doute', 2),\n",
       " ('pouill', 2),\n",
       " ('yureru', 2),\n",
       " ('conceptualisation', 2),\n",
       " ('identitaire', 2),\n",
       " ('fusionnement', 2),\n",
       " ('registres', 2),\n",
       " ('entrelacement', 2),\n",
       " ('frappant', 2),\n",
       " ('dramatis', 2),\n",
       " ('excessivement', 2),\n",
       " ('disant', 2),\n",
       " ('tralis', 2),\n",
       " ('soigneusement', 2),\n",
       " ('ussissent', 2),\n",
       " ('bsflifwmkgie', 2),\n",
       " ('ressent', 2),\n",
       " ('attachants', 2),\n",
       " ('authenticit', 2),\n",
       " ('justesse', 2),\n",
       " ('raciale', 2),\n",
       " ('yiyi', 2),\n",
       " ('rym', 2),\n",
       " ('shareid', 2),\n",
       " ('taun', 2),\n",
       " ('downshifting', 2),\n",
       " ('outfest', 2),\n",
       " ('splashes', 2),\n",
       " ('tianshi', 2),\n",
       " ('qiluo', 2),\n",
       " ('taxidermia', 2),\n",
       " ('jamesmcavoy', 2),\n",
       " ('freinds', 2),\n",
       " ('angness', 2),\n",
       " ('believable', 2),\n",
       " ('exagerated', 2),\n",
       " ('balabalabalabala', 2),\n",
       " ('joeking', 2),\n",
       " ('warmly', 2),\n",
       " ('rockli', 2),\n",
       " ('bility', 2),\n",
       " ('proudest', 2),\n",
       " ('hhhhh', 2),\n",
       " ('pinkfloyd', 2),\n",
       " ('simplicit', 2),\n",
       " ('idk', 2),\n",
       " ('gayet', 2),\n",
       " ('pignon', 2),\n",
       " ('trement', 2),\n",
       " ('enchev', 2),\n",
       " ('xmjazmzuznjky', 2),\n",
       " ('xzd', 2),\n",
       " ('mamapapa', 2),\n",
       " ('pmcg', 2),\n",
       " ('daydayup', 2),\n",
       " ('goodgoodstudy', 2),\n",
       " ('palminteri', 2),\n",
       " ('nihilist', 2),\n",
       " ('morons', 2),\n",
       " ('rcokwell', 2),\n",
       " ('newmov', 2),\n",
       " ('kyqx', 2),\n",
       " ('fellas', 2),\n",
       " ('srone', 2),\n",
       " ('didnt', 2),\n",
       " ('voicelesss', 2),\n",
       " ('moldy', 2),\n",
       " ('hilter', 2),\n",
       " ('coseplay', 2),\n",
       " ('aprproduct', 2),\n",
       " ('highot', 2),\n",
       " ('inperfectly', 2),\n",
       " ('peckinpah', 2),\n",
       " ('fking', 2),\n",
       " ('chdbits', 2),\n",
       " ('bestestest', 2),\n",
       " ('dannel', 2),\n",
       " ('furnishes', 2),\n",
       " ('nonjudgmental', 2),\n",
       " ('admirably', 2),\n",
       " ('cryt', 2),\n",
       " ('hehehe', 2),\n",
       " ('kingsmill', 2),\n",
       " ('okuribito', 2),\n",
       " ('notiong', 2),\n",
       " ('cbvivi', 2),\n",
       " ('satc', 2),\n",
       " ('tommorrow', 2),\n",
       " ('tayor', 2),\n",
       " ('jeoy', 2),\n",
       " ('weiz', 2),\n",
       " ('fullness', 2),\n",
       " ('hahahah', 2),\n",
       " ('playindex', 2),\n",
       " ('overlong', 2),\n",
       " ('tougher', 2),\n",
       " ('listplay', 2),\n",
       " ('teachs', 2),\n",
       " ('reverlution', 2),\n",
       " ('twanging', 2),\n",
       " ('euff', 2),\n",
       " ('norch', 2),\n",
       " ('tarsem', 2),\n",
       " ('terrygilliam', 2),\n",
       " ('zooeydeschanel', 2),\n",
       " ('annasophia', 2),\n",
       " ('excpet', 2),\n",
       " ('imitated', 2),\n",
       " ('keiraknightley', 2),\n",
       " ('bupt', 2),\n",
       " ('taipeighff', 2),\n",
       " ('recite', 2),\n",
       " ('ilsasss', 2),\n",
       " ('semisweet', 2),\n",
       " ('jhonnydepp', 2),\n",
       " ('yanzi', 2),\n",
       " ('inifnite', 2),\n",
       " ('perky', 2),\n",
       " ('steaking', 2),\n",
       " ('huges', 2),\n",
       " ('pgoenix', 2),\n",
       " ('mclaine', 2),\n",
       " ('reliving', 2),\n",
       " ('littlepunk', 2),\n",
       " ('iayhjajmo', 2),\n",
       " ('christinaricci', 2),\n",
       " ('yester', 2),\n",
       " ('notthing', 2),\n",
       " ('bohringer', 2),\n",
       " ('mignone', 2),\n",
       " ('colsure', 2),\n",
       " ('underestimate', 2),\n",
       " ('arguably', 2),\n",
       " ('zdt', 2),\n",
       " ('wetwetwet', 2),\n",
       " ('harge', 2),\n",
       " ('xmtgwnzyzntaw', 2),\n",
       " ('opponeent', 2),\n",
       " ('labeouf', 2),\n",
       " ('onelegged', 2),\n",
       " ('mickeyrourke', 2),\n",
       " ('soooooo', 2),\n",
       " ('jackblack', 2),\n",
       " ('appetizing', 2),\n",
       " ('stupidy', 2),\n",
       " ('unfulfilled', 2),\n",
       " ('doesnt', 2),\n",
       " ('hakunamatata', 2),\n",
       " ('zkf', 2),\n",
       " ('muahaha', 2),\n",
       " ('toystory', 2),\n",
       " ('kongfu', 2),\n",
       " ('eega', 2),\n",
       " ('perte', 2),\n",
       " ('sandrabullock', 2),\n",
       " ('hsasnu', 2),\n",
       " ('bledel', 2),\n",
       " ('odis', 2),\n",
       " ('precursory', 2),\n",
       " ('assasinate', 2),\n",
       " ('normteam', 2),\n",
       " ('sdhf', 2),\n",
       " ('gansel', 2),\n",
       " ('sonoya', 2),\n",
       " ('schnetzer', 2),\n",
       " ('piratic', 2),\n",
       " ('nyff', 2),\n",
       " ('gerri', 2),\n",
       " ('blindly', 2),\n",
       " ('fuxk', 2),\n",
       " ('brrip', 2),\n",
       " ('haomei', 2),\n",
       " ('paced', 2),\n",
       " ('chigurh', 2),\n",
       " ('bangable', 2),\n",
       " ('motherf', 2),\n",
       " ('sooooooo', 2),\n",
       " ('sammo', 2),\n",
       " ('operatio', 2),\n",
       " ('dimensionfilms', 2),\n",
       " ('arther', 2),\n",
       " ('ralphfiennes', 2),\n",
       " ('colinfarrell', 2),\n",
       " ('sanderford', 2),\n",
       " ('valium', 2),\n",
       " ('rambooo', 2),\n",
       " ('vages', 2),\n",
       " ('badguys', 2),\n",
       " ('eastgame', 2),\n",
       " ('yooyoo', 2),\n",
       " ('mouses', 2),\n",
       " ('hapless', 2),\n",
       " ('opting', 2),\n",
       " ('keanureeves', 2),\n",
       " ('bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbest',\n",
       "  2),\n",
       " ('pekingpc', 2),\n",
       " ('perfact', 2),\n",
       " ('xdu', 2),\n",
       " ('awsome', 2),\n",
       " ('guessed', 2),\n",
       " ('niebaum', 2),\n",
       " ('somethings', 2),\n",
       " ('stuffs', 2),\n",
       " ('hahahahaha', 2),\n",
       " ('jagermeister', 2),\n",
       " ('schultzzzzzzzzz', 2),\n",
       " ('noooooo', 2),\n",
       " ('trashy', 2),\n",
       " ('hahahhhh', 2),\n",
       " ('coincident', 2),\n",
       " ('characterization', 2),\n",
       " ('secretely', 2),\n",
       " ('gopsall', 2),\n",
       " ('irritating', 2),\n",
       " ('oumei', 2),\n",
       " ('weallmc', 2),\n",
       " ('lucbesson', 2),\n",
       " ('blablablabla', 2),\n",
       " ('giid', 2),\n",
       " ('goodending', 2),\n",
       " ('cuter', 2),\n",
       " ('stanson', 2),\n",
       " ('stanthan', 2),\n",
       " ('danielcraig', 2),\n",
       " ('kayxu', 2),\n",
       " ('mabrey', 2),\n",
       " ('tonyjaa', 2),\n",
       " ('noddle', 2),\n",
       " ('smtmpwy', 2),\n",
       " ('oxybot', 2),\n",
       " ('qnmlgb', 2),\n",
       " ('maay', 2),\n",
       " ('illutionist', 2),\n",
       " ('smthing', 2),\n",
       " ('evrey', 2),\n",
       " ('pys', 2),\n",
       " ('gongroo', 2),\n",
       " ('prprprprprprprprprprprprprprprprprprprprpr', 2),\n",
       " ('qqaqq', 2),\n",
       " ('wwwwwwwww', 2),\n",
       " ('shipin', 2),\n",
       " ('biood', 2),\n",
       " ('wanwa', 2),\n",
       " ('xmjqwndgwmjiw', 2),\n",
       " ('babyangle', 2),\n",
       " ('zhuangbility', 2),\n",
       " ('mlgbd', 2),\n",
       " ('emmmmmmmmmmm', 2),\n",
       " ('weibotime', 2),\n",
       " ('ellenpage', 2),\n",
       " ('javu', 2),\n",
       " ('enthralled', 2),\n",
       " ('seizes', 2),\n",
       " ('bfsu', 2),\n",
       " ('cinefile', 2),\n",
       " ('poignant', 2),\n",
       " ('dissapointing', 2),\n",
       " ('gontiti', 2),\n",
       " ('fuckurself', 2),\n",
       " ('jenning', 2),\n",
       " ('especiall', 2),\n",
       " ('gaypower', 2),\n",
       " ('seanpenn', 2),\n",
       " ('gvs', 2),\n",
       " ('jasonreitman', 2),\n",
       " ('banters', 2),\n",
       " ('colinfirth', 2),\n",
       " ('armchannel', 2),\n",
       " ('jenniferaniston', 2),\n",
       " ('fransokyo', 2),\n",
       " ('bacause', 2),\n",
       " ('distracting', 2),\n",
       " ('firday', 2),\n",
       " ('lilycollins', 2),\n",
       " ('onekill', 2),\n",
       " ('oneshoot', 2),\n",
       " ('futuer', 2),\n",
       " ('bellick', 2),\n",
       " ('undervalue', 2),\n",
       " ('agsokrdt', 2),\n",
       " ('dreamning', 2),\n",
       " ('aswgunvfwds', 2),\n",
       " ('finit', 2),\n",
       " ('fearing', 2),\n",
       " ('amazingly', 2),\n",
       " ('blondy', 2),\n",
       " ('haahaaaaa', 2),\n",
       " ('hgwxx', 2),\n",
       " ('crappy', 2),\n",
       " ('lmao', 2),\n",
       " ('bradleycooper', 2),\n",
       " ('bamf', 2),\n",
       " ('gooey', 2),\n",
       " ('observered', 2),\n",
       " ('smartassility', 2),\n",
       " ('kiangkiang', 2),\n",
       " ('riveting', 2),\n",
       " ('tfboy', 2),\n",
       " ('hxe', 2),\n",
       " ('rxq', 2),\n",
       " ('wwwwwwww', 2),\n",
       " ('pujing', 2),\n",
       " ('rarbg', 2),\n",
       " ('porra', 2),\n",
       " ('uubird', 2),\n",
       " ('freeeeeeeeeeeeeeedom', 2),\n",
       " ('balabalabala', 2),\n",
       " ('macwebb', 2),\n",
       " ('buberry', 2),\n",
       " ('maryjane', 2),\n",
       " ('nuanced', 2),\n",
       " ('arguablly', 2),\n",
       " ('hohohohoho', 2),\n",
       " ('sleight', 2),\n",
       " ('psychologically', 2),\n",
       " ('thunderous', 2),\n",
       " ('hahha', 2),\n",
       " ('goodluck', 2),\n",
       " ('donnellys', 2),\n",
       " ('coorditation', 2),\n",
       " ('sjb', 2),\n",
       " ('byye', 2),\n",
       " ('jackchan', 2),\n",
       " ('stantham', 2),\n",
       " ('agerar', 2),\n",
       " ('trippy', 2),\n",
       " ('vplay', 2),\n",
       " ('tokuten', 2),\n",
       " ('eizou', 2),\n",
       " ('haokan', 2),\n",
       " ('endding', 2),\n",
       " ('taimu', 2),\n",
       " ('huwahuwa', 2),\n",
       " ('fatesn', 2),\n",
       " ('marticulated', 2),\n",
       " ('panyu', 2),\n",
       " ('fanyang', 2),\n",
       " ('seehd', 2),\n",
       " ('wierdo', 2),\n",
       " ('newscan', 2),\n",
       " ('magcont', 2),\n",
       " ('hdwing', 2),\n",
       " ('cuish', 2),\n",
       " ('victo', 2),\n",
       " ('mengmeng', 2),\n",
       " ('pewpewpew', 2),\n",
       " ('krennic', 2),\n",
       " ('exhausting', 2),\n",
       " ('cyriak', 2),\n",
       " ('biubiu', 2),\n",
       " ('yondu', 2),\n",
       " ('jlaw', 2),\n",
       " ('wwwwww', 2),\n",
       " ('bbbbbbbbbest', 2),\n",
       " ('rightest', 2),\n",
       " ('grandmasters', 2),\n",
       " ('wakanda', 2),\n",
       " ('spidey', 2),\n",
       " ('mattdamon', 2),\n",
       " ('duangduang', 2),\n",
       " ('ifree', 2),\n",
       " ('cineworld', 2),\n",
       " ('santoshi', 2),\n",
       " ('woooooooooooooooooooooooooow', 2),\n",
       " ('drinkin', 2),\n",
       " ('fuckfuckfuck', 2),\n",
       " ('awar', 2),\n",
       " ('timtam', 2),\n",
       " ('xjb', 2),\n",
       " ('tundu', 2),\n",
       " ('ressource', 1),\n",
       " ('sultat', 1),\n",
       " ('rement', 1),\n",
       " ('excit', 1),\n",
       " ('marava', 1),\n",
       " ('rcxfzqu', 1),\n",
       " ('haleyjoelosment', 1),\n",
       " ('jarujaru', 1),\n",
       " ('klaatu', 1),\n",
       " ('woodstocks', 1),\n",
       " ('fuckyeah', 1),\n",
       " ('coooooool', 1),\n",
       " ('herdel', 1),\n",
       " ('kankan', 1),\n",
       " ('palindrome', 1),\n",
       " ('intresting', 1),\n",
       " ('hautefeuille', 1),\n",
       " ('pialat', 1),\n",
       " ('psychd', 1),\n",
       " ('gnmgd', 1),\n",
       " ('carros', 1),\n",
       " ('spontaneously', 1),\n",
       " ('terrifically', 1),\n",
       " ('analia', 1),\n",
       " ('oorlogwinter', 1),\n",
       " ('oorlog', 1),\n",
       " ('hongerwinter', 1),\n",
       " ('onetti', 1),\n",
       " ('onomichi', 1),\n",
       " ('wrinkles', 1),\n",
       " ('excentris', 1),\n",
       " ('pilogue', 1),\n",
       " ('sentimentale', 1),\n",
       " ('parfaite', 1),\n",
       " ('samours', 1),\n",
       " ('saisissante', 1),\n",
       " ('limpidit', 1),\n",
       " ('contrepied', 1),\n",
       " ('filmique', 1),\n",
       " ('dispositif', 1),\n",
       " ('plasticit', 1),\n",
       " ('riorit', 1),\n",
       " ('efficace', 1),\n",
       " ('outil', 1),\n",
       " ('poing', 1),\n",
       " ('aiguill', 1),\n",
       " ('arroi', 1),\n",
       " ('morial', 1),\n",
       " ('thibaudeau', 1),\n",
       " ('issus', 1),\n",
       " ('pornographiques', 1),\n",
       " ('bordel', 1),\n",
       " ('grannie', 1),\n",
       " ('spettacolo', 1),\n",
       " ('rasing', 1),\n",
       " ('liangbin', 1),\n",
       " ('tereska', 1),\n",
       " ('xmtgwodaymdky', 1),\n",
       " ('parlo', 1),\n",
       " ('surrealistic', 1),\n",
       " ('freakishly', 1),\n",
       " ('hantise', 1),\n",
       " ('scapes', 1),\n",
       " ('germer', 1),\n",
       " ('poignante', 1),\n",
       " ('recul', 1),\n",
       " ('lancolis', 1),\n",
       " ('souffrante', 1),\n",
       " ('umberro', 1),\n",
       " ('eclisse', 1),\n",
       " ('monicavitti', 1),\n",
       " ('urusevskiy', 1),\n",
       " ('teared', 1),\n",
       " ('zhuangde', 1),\n",
       " ('dephine', 1),\n",
       " ('attendez', 1),\n",
       " ('souhaite', 1),\n",
       " ('rosine', 1),\n",
       " ('enterthemirror', 1),\n",
       " ('zgyspp', 1),\n",
       " ('noastra', 1),\n",
       " ('gengjun', 1),\n",
       " ('ciff', 1),\n",
       " ('gavino', 1),\n",
       " ('weakens', 1),\n",
       " ('directios', 1),\n",
       " ('brancuzsk', 1),\n",
       " ('boudov', 1),\n",
       " ('birkov', 1),\n",
       " ('altmanov', 1),\n",
       " ('kolja', 1),\n",
       " ('apeoct', 1),\n",
       " ('wuwuwu', 1),\n",
       " ('boutons', 1),\n",
       " ('superbly', 1),\n",
       " ('ohmygod', 1),\n",
       " ('xiaoqingxin', 1),\n",
       " ('iseya', 1),\n",
       " ('ojzzz', 1),\n",
       " ('kyotocinema', 1),\n",
       " ('delerue', 1),\n",
       " ('rly', 1),\n",
       " ('thats', 1),\n",
       " ('duranduran', 1),\n",
       " ('gogogogo', 1),\n",
       " ('drunby', 1),\n",
       " ('kissass', 1),\n",
       " ('ypp', 1),\n",
       " ('blx', 1),\n",
       " ('puler', 1),\n",
       " ('ehnpnvow', 1),\n",
       " ('gakki', 1),\n",
       " ('maibenlaikub', 1),\n",
       " ('bukaopu', 1),\n",
       " ('tork', 1),\n",
       " ('dommage', 1),\n",
       " ('jayz', 1),\n",
       " ('debick', 1),\n",
       " ('overcook', 1),\n",
       " ('amcatlantic', 1),\n",
       " ('ofen', 1),\n",
       " ('uqhq', 1),\n",
       " ('fiben', 1),\n",
       " ('sealsniperschool', 1),\n",
       " ('usairways', 1),\n",
       " ('feesee', 1),\n",
       " ('dianying', 1),\n",
       " ('daying', 1),\n",
       " ('merde', 1),\n",
       " ('mournful', 1),\n",
       " ('eurotic', 1),\n",
       " ('serait', 1),\n",
       " ('crucrucru', 1),\n",
       " ('screwball', 1),\n",
       " ('iloop', 1),\n",
       " ('hahhha', 1),\n",
       " ('kavkalu', 1),\n",
       " ('condescending', 1),\n",
       " ('folp', 1),\n",
       " ('violetera', 1),\n",
       " ('sleeplessseattle', 1),\n",
       " ('tolstory', 1),\n",
       " ('bttiantang', 1),\n",
       " ('dovima', 1),\n",
       " ('parda', 1),\n",
       " ('ozymandius', 1),\n",
       " ('indulgent', 1),\n",
       " ('autobiographical', 1),\n",
       " ('unsentimental', 1),\n",
       " ('miracular', 1),\n",
       " ('romanticising', 1),\n",
       " ('dianekeaton', 1),\n",
       " ('miafarrow', 1),\n",
       " ('riffifi', 1),\n",
       " ('mocumentary', 1),\n",
       " ('yolfilm', 1),\n",
       " ('evne', 1),\n",
       " ('transcendant', 1),\n",
       " ('imperceptible', 1),\n",
       " ('twerp', 1),\n",
       " ('snot', 1),\n",
       " ('pompous', 1),\n",
       " ('wipers', 1),\n",
       " ('brained', 1),\n",
       " ('unclog', 1),\n",
       " ('debaser', 1),\n",
       " ('unladen', 1),\n",
       " ('sarandan', 1),\n",
       " ('hehehehehehehe', 1),\n",
       " ('xqlunk', 1),\n",
       " ('duib', 1),\n",
       " ('goddamned', 1),\n",
       " ('hundsucker', 1),\n",
       " ('vilification', 1),\n",
       " ('hanstyle', 1),\n",
       " ('whaaaaaaaaat', 1),\n",
       " ('absurdity', 1),\n",
       " ('siuc', 1),\n",
       " ('goooood', 1),\n",
       " ('ohhh', 1),\n",
       " ('shipper', 1),\n",
       " ('heyyou', 1),\n",
       " ('shakesbeard', 1),\n",
       " ('convurl', 1),\n",
       " ('grase', 1),\n",
       " ('maggiesmith', 1),\n",
       " ('hahahahhaaha', 1),\n",
       " ('beyong', 1),\n",
       " ('fyp', 1),\n",
       " ('svernal', 1),\n",
       " ('ladiesman', 1),\n",
       " ('buggs', 1),\n",
       " ('fansmovie', 1),\n",
       " ('uncult', 1),\n",
       " ('foward', 1),\n",
       " ('waak', 1),\n",
       " ('samdaniel', 1),\n",
       " ('xmjazmtkyoti', 1),\n",
       " ('danieldl', 1),\n",
       " ('dicapreo', 1),\n",
       " ('briandepalma', 1),\n",
       " ('outdate', 1),\n",
       " ('roberdeniro', 1),\n",
       " ('aequitas', 1),\n",
       " ('cocksucking', 1),\n",
       " ('cymande', 1),\n",
       " ('passg', 1),\n",
       " ('nykvist', 1),\n",
       " ('ntkxmzmzmdg', 1),\n",
       " ('observant', 1),\n",
       " ('guilted', 1),\n",
       " ('hedonistic', 1),\n",
       " ('fonfon', 1),\n",
       " ('bregovic', 1),\n",
       " ('rennt', 1),\n",
       " ('rayfile', 1),\n",
       " ('vivling', 1),\n",
       " ('annoys', 1),\n",
       " ('ciaux', 1),\n",
       " ('effets', 1),\n",
       " ('gothiquement', 1),\n",
       " ('visuel', 1),\n",
       " ('anecdote', 1),\n",
       " ('manifestement', 1),\n",
       " ('matographique', 1),\n",
       " ('satyajitray', 1),\n",
       " ('unlikable', 1),\n",
       " ('totalarian', 1),\n",
       " ('captalism', 1),\n",
       " ('taimanov', 1),\n",
       " ('viginia', 1),\n",
       " ('furyprep', 1),\n",
       " ('guarantees', 1),\n",
       " ('fareast', 1),\n",
       " ('kurusawa', 1),\n",
       " ('ritrovato', 1),\n",
       " ('arlecchino', 1),\n",
       " ('cinefan', 1),\n",
       " ('authentique', 1),\n",
       " ('amazoning', 1),\n",
       " ('superimposition', 1),\n",
       " ('kikyo', 1),\n",
       " ('medoc', 1),\n",
       " ('hustel', 1),\n",
       " ('pezktgsjuni', 1),\n",
       " ('pltxnxrwihcx', 1),\n",
       " ('blixa', 1),\n",
       " ('carvinal', 1),\n",
       " ('endormant', 1),\n",
       " ('rythme', 1),\n",
       " ('visibilit', 1),\n",
       " ('incontestable', 1),\n",
       " ('pleinement', 1),\n",
       " ('contribuent', 1),\n",
       " ('lucidit', 1),\n",
       " ('impeccable', 1),\n",
       " ('brendo', 1),\n",
       " ('champoo', 1),\n",
       " ('frod', 1),\n",
       " ('dvdivx', 1),\n",
       " ('vigi', 1),\n",
       " ('williamwyler', 1),\n",
       " ('scarred', 1),\n",
       " ('obstructions', 1),\n",
       " ('undertone', 1),\n",
       " ('sensitively', 1),\n",
       " ('paean', 1),\n",
       " ('fetched', 1),\n",
       " ('rauger', 1),\n",
       " ('thisisjons', 1),\n",
       " ('veci', 1),\n",
       " ('alaundo', 1),\n",
       " ('johnwayne', 1),\n",
       " ('moriconne', 1),\n",
       " ('reassuring', 1),\n",
       " ('sophiscated', 1),\n",
       " ('lighthearted', 1),\n",
       " ('poaul', 1),\n",
       " ('mcqeen', 1),\n",
       " ('histc', 1),\n",
       " ('differece', 1),\n",
       " ('workmanship', 1),\n",
       " ('signifying', 1),\n",
       " ('tlfminisd', 1),\n",
       " ('allocine', 1),\n",
       " ('haoxiao', 1),\n",
       " ('juue', 1),\n",
       " ('enen', 1),\n",
       " ('lannnnnnnnnnnnnnnnnnnnnn', 1),\n",
       " ('cusine', 1),\n",
       " ('beautifull', 1),\n",
       " ('tosar', 1),\n",
       " ('mariocasas', 1),\n",
       " ('donnerstag', 1),\n",
       " ('dienstag', 1),\n",
       " ('aggy', 1),\n",
       " ('dafuq', 1),\n",
       " ('souze', 1),\n",
       " ('chdweb', 1),\n",
       " ('zenzen', 1),\n",
       " ('contradicci', 1),\n",
       " ('samob', 1),\n",
       " ('randyorton', 1),\n",
       " ('subbtt', 1),\n",
       " ('uberuaga', 1),\n",
       " ('thone', 1),\n",
       " ('lalilali', 1),\n",
       " ('vinne', 1),\n",
       " ('morde', 1),\n",
       " ('aviadu', 1),\n",
       " ('dreamlover', 1),\n",
       " ('cerina', 1),\n",
       " ('kneeguo', 1),\n",
       " ('hanni', 1),\n",
       " ('eternit', 1),\n",
       " ('maternit', 1),\n",
       " ('fuckingboring', 1),\n",
       " ('toudou', 1),\n",
       " ('rmpr', 1),\n",
       " ('jsjs', 1),\n",
       " ('kickass', 1),\n",
       " ('hdchina', 1),\n",
       " ('messed', 1),\n",
       " ('hhhhhhhhhhhh', 1),\n",
       " ('jonal', 1),\n",
       " ('johal', 1),\n",
       " ('instgram', 1),\n",
       " ('timallen', 1),\n",
       " ('marshell', 1),\n",
       " ('graet', 1),\n",
       " ('traim', 1),\n",
       " ...]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview only the first (token, count) pairs: displaying the whole list\n",
    "# floods the notebook with a thousand-line output and bloats the saved file.\n",
    "unknown_string[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['dvd', 'rip']\n",
      "['bdr', 'ip']\n",
      "['cca', 'v']\n",
      "['bji', 'ff']\n",
      "['hitch', 'its', 'ch']\n",
      "['bti', 'h']\n",
      "['johnny', 'depp']\n",
      "['jl', 'o']\n",
      "['xxo', 'o']\n",
      "['nn', 'd']\n"
     ]
    }
   ],
   "source": [
    "# Print the result of `cut` for each of the first ten (token, count) pairs;\n",
    "# the count is not needed here, so it is discarded.\n",
    "for token, _ in unknown_string[:10]:\n",
    "    print(cut(token))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 其它"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('50min', 2),\n",
       " ('😛', 2),\n",
       " ('ao3', 1),\n",
       " ('25am', 1),\n",
       " ('1dfd74a56054b289a4a244630d7ed1e70687f21b', 2),\n",
       " ('d6b6d9084ddc8542b188f6681d1c085f51dfb7e1', 1),\n",
       " ('1280187535', 1),\n",
       " ('📘', 3),\n",
       " ('123', 14),\n",
       " ('ac888256', 1),\n",
       " ('718', 1),\n",
       " ('d3', 3),\n",
       " ('50pm', 2),\n",
       " ('12cinema', 1),\n",
       " ('20070629', 2),\n",
       " ('180g', 1),\n",
       " ('0622', 1),\n",
       " ('ucta7et0ric', 1),\n",
       " ('478', 1),\n",
       " ('20150426', 1),\n",
       " ('20170513', 1),\n",
       " ('〒', 6),\n",
       " ('219', 1),\n",
       " ('1300', 5),\n",
       " ('╥', 10),\n",
       " ('20101024', 2),\n",
       " ('432', 2),\n",
       " ('3dcg', 1),\n",
       " ('👲', 4),\n",
       " ('1500w', 2),\n",
       " ('5788', 1),\n",
       " ('９', 6),\n",
       " ('5790', 1),\n",
       " ('2012sep3', 1),\n",
       " ('3018307', 2),\n",
       " ('20120821', 1),\n",
       " ('20100818', 1),\n",
       " ('25308321', 1),\n",
       " ('t13329271', 1),\n",
       " ('38022383', 1),\n",
       " ('8133320', 1),\n",
       " ('ntk2mdu0mty', 1),\n",
       " ('゜', 7),\n",
       " ('f7', 1),\n",
       " ('20150412', 2),\n",
       " ('283013556', 1),\n",
       " ('60', 223),\n",
       " ('tlf624', 2),\n",
       " ('159', 4),\n",
       " ('6200', 1),\n",
       " ('7540318', 1),\n",
       " ('tt0141926', 1),\n",
       " ('40min', 2),\n",
       " ('0sb4qvqvu', 1),\n",
       " ('2100', 1),\n",
       " ('1i510mfj', 1),\n",
       " ('3h', 2),\n",
       " ('602f80db0100e5eh', 2),\n",
       " ('571', 10),\n",
       " ('20131127imax', 1),\n",
       " ('36minq', 1),\n",
       " ('l041w', 1),\n",
       " ('yctwpr6b7ge', 1),\n",
       " ('war3', 2),\n",
       " ('01', 104),\n",
       " ('1266', 1),\n",
       " ('¬', 14),\n",
       " ('521', 1),\n",
       " ('dvd9', 1),\n",
       " ('─', 29),\n",
       " ('q0q', 1),\n",
       " ('184', 2),\n",
       " ('a731364e7b392af2f5b7bc897328cba450f9f95d', 2),\n",
       " ('3792', 1),\n",
       " ('u6lk64s0pw8', 1),\n",
       " ('23a', 1),\n",
       " ('512', 1),\n",
       " ('🎵', 2),\n",
       " ('dota2', 1),\n",
       " ('̐', 1),\n",
       " ('👋', 3),\n",
       " ('2169', 2),\n",
       " ('61', 4),\n",
       " ('201707152300', 1),\n",
       " ('🐲', 1),\n",
       " ('058', 1),\n",
       " ('8925', 2),\n",
       " ('\\\\', 146),\n",
       " ('😔', 14),\n",
       " ('dorma95', 1),\n",
       " ('5535', 1),\n",
       " ('100', 340),\n",
       " ('153561485', 3),\n",
       " ('1152', 2),\n",
       " ('2006', 101),\n",
       " ('̻', 1),\n",
       " ('5star', 3),\n",
       " ('5lbeyv5pz6qg', 2),\n",
       " ('300km', 1),\n",
       " ('5a', 1),\n",
       " ('20140313', 1),\n",
       " ('≠', 9),\n",
       " ('109', 4),\n",
       " ('130', 25),\n",
       " ('9000', 3),\n",
       " ('2095303657', 1),\n",
       " ('166', 1),\n",
       " ('20140517', 1),\n",
       " ('286', 1),\n",
       " ('xmjizmtk0mtaw', 2),\n",
       " ('tt1456661', 1),\n",
       " ('20160608', 1),\n",
       " ('1180', 1),\n",
       " ('7494126', 2),\n",
       " ('s1', 4),\n",
       " ('384k', 2),\n",
       " ('1119', 1),\n",
       " ('1976', 3),\n",
       " ('4004', 1),\n",
       " ('mr58', 1),\n",
       " ('2916', 2),\n",
       " ('⁄', 40),\n",
       " ('no1', 10),\n",
       " ('5dg', 1),\n",
       " ('4953496', 1),\n",
       " ('siff16', 1),\n",
       " ('5d', 6),\n",
       " ('9aa786a606eee1f6a26c3411a67731de2dd7c877', 2),\n",
       " ('🐷', 1),\n",
       " ('✧', 10),\n",
       " ('24with', 1),\n",
       " ('95147', 2),\n",
       " ('koda40', 1),\n",
       " ('st7', 1),\n",
       " ('20100718', 1),\n",
       " ('″', 2),\n",
       " ('20151219', 1),\n",
       " ('20160718', 1),\n",
       " ('f83a4be12a', 1),\n",
       " ('1222', 1),\n",
       " ('1776', 1),\n",
       " ('1233461', 1),\n",
       " ('tt0202383', 1),\n",
       " ('914', 1),\n",
       " ('🐔', 1),\n",
       " ('4s', 1),\n",
       " ('1701', 1),\n",
       " ('u14', 1),\n",
       " ('20130227', 1),\n",
       " ('≤', 4),\n",
       " ('010208', 2),\n",
       " ('page47', 1),\n",
       " ('08', 144),\n",
       " ('av276985', 1),\n",
       " ('ep4', 2),\n",
       " ('triplex2', 1),\n",
       " ('1o81xo6m', 1),\n",
       " ('2yfp', 1),\n",
       " ('4a56fcqkdyq2', 1),\n",
       " ('9136d0b0', 1),\n",
       " ('195', 2),\n",
       " ('20130908', 1),\n",
       " ('162min', 2),\n",
       " ('∠', 54),\n",
       " ('1gdoxavh', 1),\n",
       " ('9pm', 1),\n",
       " ('74', 9),\n",
       " ('228', 6),\n",
       " ('20150121', 1),\n",
       " ('siff17', 1),\n",
       " ('0716', 1),\n",
       " ('20130805jtl', 1),\n",
       " ('20100316', 1),\n",
       " ('u47', 1),\n",
       " ('53rd', 1),\n",
       " ('jtrqagludqd95k5s5ce', 1),\n",
       " ('888', 2),\n",
       " ('ep29', 1),\n",
       " ('６', 5),\n",
       " ('1889997185', 1),\n",
       " ('20160914', 1),\n",
       " ('170427', 1),\n",
       " ('33333', 1),\n",
       " ('20080306', 1),\n",
       " ('10261119', 1),\n",
       " ('u84', 1),\n",
       " ('bjiff2017', 2),\n",
       " ('1961', 4),\n",
       " ('000', 8),\n",
       " ('mje2mtcxotc', 1),\n",
       " ('od7w4k8ksgo', 1),\n",
       " ('120101', 1),\n",
       " ('170', 5),\n",
       " ('9286550cm', 1),\n",
       " ('20090705', 1),\n",
       " ('4700', 1),\n",
       " ('20100812', 1),\n",
       " ('mp5', 3),\n",
       " ('402', 3),\n",
       " ('613', 1),\n",
       " ('20110902', 1),\n",
       " ('\\ue412', 1),\n",
       " ('👎', 4),\n",
       " ('￣', 203),\n",
       " ('💆', 1),\n",
       " ('1948', 3),\n",
       " ('ccav8', 2),\n",
       " ('2626', 1),\n",
       " ('330', 1),\n",
       " ('235', 1),\n",
       " ('1994', 29),\n",
       " ('xmjy2mzawnzi4', 1),\n",
       " ('4mqbrcj3k3c', 1),\n",
       " ('5044818', 1),\n",
       " ('0001', 1),\n",
       " ('4444', 1),\n",
       " ('711', 1),\n",
       " ('ͤ', 1),\n",
       " ('20150320', 1),\n",
       " ('🔐', 1),\n",
       " ('2184k', 1),\n",
       " ('᷅', 7),\n",
       " ('8633', 1),\n",
       " ('19siff', 1),\n",
       " ('24k', 3),\n",
       " ('2019k', 2),\n",
       " ('ova2', 1),\n",
       " ('nquapkbmks8', 1),\n",
       " ('3deprive', 1),\n",
       " ('2012dec29', 2),\n",
       " ('005', 1),\n",
       " ('rz0brdy', 1),\n",
       " ('밌', 3),\n",
       " ('ฅ', 6),\n",
       " ('17348', 1),\n",
       " ('20130119', 2),\n",
       " ('531', 1),\n",
       " ('1868', 1),\n",
       " ('≪', 9),\n",
       " ('120212', 1),\n",
       " ('97db28f635ac65285bac7790ca7a36ca2234679201', 1),\n",
       " ('🇭', 2),\n",
       " ('d9dea7ecb35d8b2d1c4b9141f7d8a18a4f24c559', 2),\n",
       " ('k2', 7),\n",
       " ('round1', 2),\n",
       " ('502', 4),\n",
       " ('8067', 1),\n",
       " ('25725744', 2),\n",
       " ('867m', 1),\n",
       " ('p1sfzvq6ebe', 1),\n",
       " ('20120314', 1),\n",
       " ('194', 5),\n",
       " ('tt0087578', 1),\n",
       " ('biff2017', 1),\n",
       " ('tv2', 1),\n",
       " ('⭕', 1),\n",
       " ('60s', 6),\n",
       " ('2015siff', 9),\n",
       " ('．', 178),\n",
       " ('20080511', 1),\n",
       " ('x5', 3),\n",
       " ('517', 1),\n",
       " ('659', 1),\n",
       " ('55945', 1),\n",
       " ('h2', 2),\n",
       " ('201203', 2),\n",
       " ('8afd', 1),\n",
       " ('39415945', 1),\n",
       " ('2308ec70', 2),\n",
       " ('201201', 1),\n",
       " ('39', 23),\n",
       " ('121415', 2),\n",
       " ('183', 2),\n",
       " ('2a', 1),\n",
       " ('02', 94),\n",
       " ('180cm', 1),\n",
       " ('t5af9c675b', 2),\n",
       " ('139', 3),\n",
       " ('2b', 138),\n",
       " ('765', 10),\n",
       " ('081209', 1),\n",
       " ('pm13', 1),\n",
       " ('ᗜ', 2),\n",
       " ('tt0105839', 1),\n",
       " ('18th', 1),\n",
       " ('aq1vn9rm', 1),\n",
       " ('24', 177),\n",
       " ('́', 16),\n",
       " ('250507', 1),\n",
       " ('20170321', 1),\n",
       " ('s11', 1),\n",
       " ('😜', 5),\n",
       " ('6th', 1),\n",
       " ('🙊', 3),\n",
       " ('86m', 1),\n",
       " ('18', 228),\n",
       " ('ntm2njk0njq', 1),\n",
       " ('part1', 1),\n",
       " ('first2017', 2),\n",
       " ('102559025', 1),\n",
       " ('aqu5gm2x', 1),\n",
       " ('1944194', 1),\n",
       " ('t800', 4),\n",
       " ('top250', 8),\n",
       " ('khwii782hce', 1),\n",
       " ('7j67gcfbais', 1),\n",
       " ('080404sc', 1),\n",
       " ('103', 5),\n",
       " ('023', 1),\n",
       " ('252', 3),\n",
       " ('☹', 2),\n",
       " ('mkv912', 1),\n",
       " ('1315', 1),\n",
       " ('03', 126),\n",
       " ('20110407', 1),\n",
       " ('d645', 1),\n",
       " ('04', 126),\n",
       " ('20120413', 2),\n",
       " ('0406', 5),\n",
       " ('u2be', 1),\n",
       " ('20110915', 2),\n",
       " ('090215', 1),\n",
       " ('860', 1),\n",
       " ('ken861222', 1),\n",
       " ('7200', 1),\n",
       " ('20091210', 2),\n",
       " ('4867495', 2),\n",
       " ('211', 1),\n",
       " ('9143', 1),\n",
       " ('1ge2qycv', 1),\n",
       " ('bwv1007', 1),\n",
       " ('2017103', 1),\n",
       " ('007', 322),\n",
       " ('20070211', 1),\n",
       " ('tt0357894', 1),\n",
       " ('17048', 1),\n",
       " ('230208', 1),\n",
       " ('up70', 1),\n",
       " ('100000', 1),\n",
       " ('sam7', 2),\n",
       " ('207min', 2),\n",
       " ('tt0163025', 1),\n",
       " ('2006nov', 1),\n",
       " ('125', 5),\n",
       " ('x3', 3),\n",
       " ('t6152bf03e', 2),\n",
       " ('894', 1),\n",
       " ('65min', 1),\n",
       " ('6881', 1),\n",
       " ('tt0057611', 1),\n",
       " ('115', 6),\n",
       " ('5831', 1),\n",
       " ('1398592', 1),\n",
       " ('legallyblonde2', 1),\n",
       " ('pl7', 1),\n",
       " ('tt5029608', 1),\n",
       " ('◜', 1),\n",
       " ('233333', 10),\n",
       " ('999', 6),\n",
       " ('83', 23),\n",
       " ('20090823', 2),\n",
       " ('5th', 2),\n",
       " ('r8', 2),\n",
       " ('71', 10),\n",
       " ('233333333333', 1),\n",
       " ('1311', 3),\n",
       " ('203', 2),\n",
       " ('1540', 1),\n",
       " ('20110613', 1),\n",
       " ('1430', 1),\n",
       " ('41479691', 2),\n",
       " ('120min', 1),\n",
       " ('ew9v1bjtu0e', 1),\n",
       " ('424', 1),\n",
       " ('kv2', 2),\n",
       " ('20120430', 1),\n",
       " ('293mins', 2),\n",
       " ('170cm', 1),\n",
       " ('imax3', 3),\n",
       " ('55iw', 1),\n",
       " ('4800', 4),\n",
       " ('5555555555555', 1),\n",
       " ('ᵌ', 1),\n",
       " ('204', 1),\n",
       " ('265', 2),\n",
       " ('1995', 29),\n",
       " ('080618', 1),\n",
       " ('0830', 1),\n",
       " ('❥', 1),\n",
       " ('9a', 6),\n",
       " ('081014', 1),\n",
       " ('20070707', 1),\n",
       " ('͕', 1),\n",
       " ('2017berlinale', 1),\n",
       " ('1910', 2),\n",
       " ('10086', 5),\n",
       " ('3105', 1),\n",
       " ('37g', 2),\n",
       " ('2723022', 2),\n",
       " ('😨', 1),\n",
       " ('52020', 1),\n",
       " ('ͩ', 2),\n",
       " ('81', 24),\n",
       " ('c2pq2n8i', 1),\n",
       " ('sultenfuss1972', 1),\n",
       " ('qikhib83qhmai', 1),\n",
       " ('450', 2),\n",
       " ('xntm1ota1ntc2', 1),\n",
       " ('av439373', 1),\n",
       " ('233333333ed', 1),\n",
       " ('1960s', 3),\n",
       " ('2032', 1),\n",
       " ('st8', 1),\n",
       " ('579', 1),\n",
       " ('siff4', 1),\n",
       " ('xodc1nty', 1),\n",
       " ('true2', 1),\n",
       " ('xmteznjq4ody4', 2),\n",
       " ('😖', 5),\n",
       " ('378', 1),\n",
       " ('🌪', 1),\n",
       " ('160608', 1),\n",
       " ('1980s', 4),\n",
       " ('23333', 38),\n",
       " ('cf2012', 1),\n",
       " ('1ge8bhen', 1),\n",
       " ('hou26', 1),\n",
       " ('bjiff6', 1),\n",
       " ('┮', 2),\n",
       " ('20080314', 1),\n",
       " ('﹣', 3),\n",
       " ('1774023189', 1),\n",
       " ('20111118', 2),\n",
       " ('lv1', 1),\n",
       " ('160820', 1),\n",
       " ('1464x1080', 2),\n",
       " ('‖', 2),\n",
       " ('249', 2),\n",
       " ('1953', 2),\n",
       " ('qdwren5jmkc', 2),\n",
       " ('3x', 1),\n",
       " ('aqu5gs1j', 1),\n",
       " ('009', 3),\n",
       " ('‘', 202),\n",
       " ('10cm', 2),\n",
       " ('ͧ', 1),\n",
       " ('2833', 1),\n",
       " ('1895', 1),\n",
       " ('22gb', 1),\n",
       " ('30s', 2),\n",
       " ('20110621', 1),\n",
       " ('1989', 20),\n",
       " ('20151011', 1),\n",
       " ('imdb248', 2),\n",
       " ('20170117', 1),\n",
       " ('376m', 1),\n",
       " ('46', 13),\n",
       " ('637145', 1),\n",
       " ('584m', 1),\n",
       " ('mzawoteyodm5ma', 1),\n",
       " ('◕', 4),\n",
       " ('◑', 1),\n",
       " ('2084', 1),\n",
       " ('98fans', 1),\n",
       " ('1944', 9),\n",
       " ('6000', 14),\n",
       " ('20110201', 2),\n",
       " ('4291588', 2),\n",
       " ('20071217', 1),\n",
       " ('3dimax', 7),\n",
       " ('2728265', 1),\n",
       " ('mi3', 1),\n",
       " ('s3ep2', 1),\n",
       " ('20170530', 2),\n",
       " ('248', 2),\n",
       " ('20151112', 1),\n",
       " ('−', 2),\n",
       " ('e1', 1),\n",
       " ('459', 1),\n",
       " ('6500', 1),\n",
       " ('batman3', 1),\n",
       " ('138', 8),\n",
       " ('605', 1),\n",
       " ('0806', 1),\n",
       " ('1956', 17),\n",
       " ('20120818', 2),\n",
       " ('1992', 13),\n",
       " ('20140124', 1),\n",
       " ('2012jan19', 1),\n",
       " ('534', 1),\n",
       " ('xnjq2njqwmdg', 1),\n",
       " ('ⓘ', 2),\n",
       " ('45', 80),\n",
       " ('mjm5oda1nzi2ma', 1),\n",
       " ('2015183', 1),\n",
       " ('jxtv4', 1),\n",
       " ('ac3', 22),\n",
       " ('rjzv3msojrw', 1),\n",
       " ('9with', 1),\n",
       " ('666', 10),\n",
       " ('st6', 1),\n",
       " ('203562338', 1),\n",
       " ('xntc3otu5nzi4', 1),\n",
       " ('84', 31),\n",
       " ('cny30', 1),\n",
       " ('20130809', 1),\n",
       " ('￼', 1),\n",
       " ('cc00xmtmynze2mty', 3),\n",
       " ('5starlightangel', 1),\n",
       " ('100616', 1),\n",
       " ('20150420', 1),\n",
       " ('18538836', 1),\n",
       " ('0728', 1),\n",
       " ('2d9', 2),\n",
       " ('78926', 1),\n",
       " ('525', 1),\n",
       " ('1022', 1),\n",
       " ('🐚', 1),\n",
       " ('20130412', 2),\n",
       " ('57462', 1),\n",
       " ('su27', 1),\n",
       " ('20110120', 1),\n",
       " ('55', 42),\n",
       " ('xmjq0mdgwndi0', 1),\n",
       " ('9e', 1),\n",
       " ('ȏ', 1),\n",
       " ('imax2d', 4),\n",
       " ('ps12', 1),\n",
       " ('138min', 1),\n",
       " ('20150110', 1),\n",
       " ('150106', 1),\n",
       " ('1448', 1),\n",
       " ('mo4', 1),\n",
       " ('l0djtcla', 1),\n",
       " ('k9c8cbdhnuue', 1),\n",
       " ('20101001', 1),\n",
       " ('20100905', 2),\n",
       " ('2013jan24', 1),\n",
       " ('eiff2014', 1),\n",
       " ('xndcyndgynjy0', 1),\n",
       " ('1520', 15),\n",
       " ('20120531', 1),\n",
       " ('101005', 1),\n",
       " ('06', 150),\n",
       " ('129', 4),\n",
       " ('930', 3),\n",
       " ('20120706', 2),\n",
       " ('20090205', 1),\n",
       " ('av688131', 1),\n",
       " ('䁖', 1),\n",
       " ('5w', 1),\n",
       " ('233333333', 3),\n",
       " ('😳', 16),\n",
       " ('20170408', 2),\n",
       " ('071206', 1),\n",
       " ('berlinale2014', 1),\n",
       " ('f1', 31),\n",
       " ('555', 25),\n",
       " ('688', 1),\n",
       " ('xnzewntm4odq', 2),\n",
       " ('↖', 4),\n",
       " ('😆', 9),\n",
       " ('20090502', 1),\n",
       " ('invisible2013', 2),\n",
       " ('✨', 1),\n",
       " ('imdb2', 2),\n",
       " ('200170364', 1),\n",
       " ('14min', 1),\n",
       " ('fm92', 1),\n",
       " ('💜', 1),\n",
       " ('1c0seve4', 1),\n",
       " ('̃', 1),\n",
       " ('20160922', 2),\n",
       " ('280min', 1),\n",
       " ('※', 2),\n",
       " ('178min', 1),\n",
       " ('͡', 77),\n",
       " ('20160403', 2),\n",
       " ('tc42', 1),\n",
       " ('cod6mw2', 2),\n",
       " ('3686', 1),\n",
       " ('x264', 76),\n",
       " ('̉', 1),\n",
       " ('®', 1),\n",
       " ('̥', 17),\n",
       " ('z7', 1),\n",
       " ('279', 1),\n",
       " ('2f', 8),\n",
       " ('︿', 18),\n",
       " ('┬', 10),\n",
       " ('1613856', 2),\n",
       " ('201406', 1),\n",
       " ('50kg', 1),\n",
       " ('bwv106', 2),\n",
       " ('654m', 1),\n",
       " ('47059', 1),\n",
       " ('2047', 1),\n",
       " ('xmta0nzc1njmy', 2),\n",
       " ('3948', 1),\n",
       " ('2l', 1),\n",
       " ('2k', 4),\n",
       " ('ac1878293', 1),\n",
       " ('30am', 2),\n",
       " ('nta0njy3mza', 2),\n",
       " ('20170412', 1),\n",
       " ('8910', 1),\n",
       " ('̂', 1),\n",
       " ('20110410', 2),\n",
       " ('20100923', 2),\n",
       " ('😣', 1),\n",
       " ('38g', 1),\n",
       " ('💮', 1),\n",
       " ('290204', 1),\n",
       " ('113', 2),\n",
       " ('c2c8923a', 1),\n",
       " ('20070917', 2),\n",
       " ('b7', 4),\n",
       " ('2107', 1),\n",
       " ('3604', 2),\n",
       " ('2ftrack', 1),\n",
       " ('20140331', 1),\n",
       " ('97190', 2),\n",
       " ('20121203', 1),\n",
       " ('∞', 3),\n",
       " ('▾', 1),\n",
       " ('tv0', 1),\n",
       " ('131', 3),\n",
       " ('tt0105534', 2),\n",
       " ('0826', 1),\n",
       " ('6013', 1),\n",
       " ('1492', 1),\n",
       " ('50s', 1),\n",
       " ('t1000', 6),\n",
       " ('͙', 1),\n",
       " ('27', 122),\n",
       " ('🎅', 2),\n",
       " ('s65', 1),\n",
       " ('401', 1),\n",
       " ('‸', 7),\n",
       " ('140430', 2),\n",
       " ('501', 13),\n",
       " ('2987567', 1),\n",
       " ('900', 1),\n",
       " ('͟', 8),\n",
       " ('20141120yl', 1),\n",
       " ('┛', 3),\n",
       " ('b13', 1),\n",
       " ('movie12258', 2),\n",
       " ('1993', 22),\n",
       " ('’', 233),\n",
       " ('js7', 1),\n",
       " ('🌂', 1),\n",
       " ('20081115', 1),\n",
       " ('2333322', 1),\n",
       " ('2904623', 1),\n",
       " ('2012', 260),\n",
       " ('f045c3234', 1),\n",
       " ('tv24', 1),\n",
       " ('ze0x6rsngqcnvaimpbjfubxdrrttrg', 1),\n",
       " ('1482', 2),\n",
       " ('405', 1),\n",
       " ('20160415', 2),\n",
       " ('20pm', 1),\n",
       " ('20120610', 1),\n",
       " ('⁻', 1),\n",
       " ('7f3467e5f6ba2b866c1ef7029a113db4c33311dc', 1),\n",
       " ('cctv6hd', 1),\n",
       " ('520', 6),\n",
       " ('991m', 2),\n",
       " ('0x', 2),\n",
       " ('✾', 1),\n",
       " ('html5', 3),\n",
       " ('4268', 2),\n",
       " ('1986', 8),\n",
       " ('cd1', 5),\n",
       " ('u31', 1),\n",
       " ('1500', 5),\n",
       " ('3221', 1),\n",
       " ('1874', 1),\n",
       " ('20150920', 1),\n",
       " ('149', 4),\n",
       " ('͑', 2),\n",
       " ('20060813', 1),\n",
       " ('20120809', 2),\n",
       " ('4291637', 2),\n",
       " ('5678910', 1),\n",
       " ('lff2015', 1),\n",
       " ('🎁', 1),\n",
       " ('186', 2),\n",
       " ('liar2014', 1),\n",
       " ('cc00xnd', 1),\n",
       " ('2014may17', 1),\n",
       " ('229318', 1),\n",
       " ('zc1000', 1),\n",
       " ('1400', 2),\n",
       " ('422', 3),\n",
       " ('5km', 1),\n",
       " ('b88c', 1),\n",
       " ('1900', 10),\n",
       " ('4142', 3),\n",
       " ('20110406', 2),\n",
       " ('14000', 1),\n",
       " ('05hbv', 1),\n",
       " ('2x', 4),\n",
       " ('5465215', 1),\n",
       " ('189cm', 1),\n",
       " ('7546055', 1),\n",
       " ('🌵', 1),\n",
       " ('😑', 4),\n",
       " ('razar910', 2),\n",
       " ('70th', 1),\n",
       " ('20120301', 1),\n",
       " ('3pm', 1),\n",
       " ('8mm', 4),\n",
       " ('090212', 2),\n",
       " ('5min', 7),\n",
       " ('2w', 1),\n",
       " ('◐', 1),\n",
       " ('0szruu', 1),\n",
       " ('4504374', 2),\n",
       " ('364', 1),\n",
       " ('ffwv8l', 1),\n",
       " ('6audio', 1),\n",
       " ('1997', 41),\n",
       " ('152', 2),\n",
       " ('196cm', 1),\n",
       " ('6202', 1),\n",
       " ('m47', 2),\n",
       " ('rqmegupqwp5h3ym', 1),\n",
       " ('1885', 1),\n",
       " ('9800', 1),\n",
       " ('2011jan6', 1),\n",
       " ('b5df', 1),\n",
       " ('201', 2),\n",
       " ('20dts', 2),\n",
       " ('｡', 38),\n",
       " ('t25f3e020d', 1),\n",
       " ('b8', 8),\n",
       " ('140', 27),\n",
       " ('a524', 1),\n",
       " ('48558', 1),\n",
       " ('3x120', 1),\n",
       " ('k2so', 1),\n",
       " ('110min', 1),\n",
       " ('20151224', 1),\n",
       " ('9c', 5),\n",
       " ('263', 1),\n",
       " ('tt0126839', 2),\n",
       " ('2168', 1),\n",
       " ('217', 2),\n",
       " ('f1682340o1p26', 1),\n",
       " ('🐑', 1),\n",
       " ('1wtc', 1),\n",
       " ('1996', 28),\n",
       " ('ac1511804', 1),\n",
       " ('120km', 2),\n",
       " ('188', 4),\n",
       " ('﹂', 1),\n",
       " ('1302644', 2),\n",
       " ('ca224', 1),\n",
       " ('tt0073629', 1),\n",
       " ('av3997994', 1),\n",
       " ('imdb250', 4),\n",
       " ('20170712', 1),\n",
       " ('20150211', 1),\n",
       " ('5285', 1),\n",
       " ('3k', 2),\n",
       " ('😀', 2),\n",
       " ('98g', 1),\n",
       " ('20170213', 1),\n",
       " ('d960', 1),\n",
       " ('331', 2),\n",
       " ('0531cgv', 1),\n",
       " ('b6tw7waa', 1),\n",
       " ('mk2', 3),\n",
       " ('437', 1),\n",
       " ('4913', 1),\n",
       " ('87', 30),\n",
       " ('926', 1),\n",
       " ('1073741859', 1),\n",
       " ('301989888', 1),\n",
       " ('1h20min', 1),\n",
       " ('5330', 1),\n",
       " ('80cm', 1),\n",
       " ('161013', 2),\n",
       " ('㉨', 3),\n",
       " ('141130', 1),\n",
       " ('1866', 2),\n",
       " ('av446069', 1),\n",
       " ('2013mar', 1),\n",
       " ('f03cf621bb', 1),\n",
       " ('16914', 2),\n",
       " ('1920x1080p', 1),\n",
       " ('mi6', 1),\n",
       " ('12jul', 1),\n",
       " ('201505', 2),\n",
       " ('1922', 1),\n",
       " ('🐝', 3),\n",
       " ('1972', 7),\n",
       " ('🤗', 1),\n",
       " ('ccav10', 6),\n",
       " ('g1', 1),\n",
       " ('42qu', 1),\n",
       " ('546', 1),\n",
       " ('20120522', 1),\n",
       " ('59', 17),\n",
       " ('̰', 2),\n",
       " ('∀', 16),\n",
       " ('ُ', 1),\n",
       " ('20110508', 1),\n",
       " ('20170227', 1),\n",
       " ('d959', 1),\n",
       " ('fool2', 1),\n",
       " ('285', 1),\n",
       " ('20170514', 1),\n",
       " ('20170414', 1),\n",
       " ('✋', 1),\n",
       " ('1946', 13),\n",
       " ('63', 17),\n",
       " ('20151020', 2),\n",
       " ('2413', 2),\n",
       " ('tt0277027', 1),\n",
       " ('xmzi5otg1odq', 1),\n",
       " ('1096', 1),\n",
       " ('av1834850', 1),\n",
       " ('909574', 1),\n",
       " ('1908', 1),\n",
       " ('080415', 1),\n",
       " ('547', 1),\n",
       " ('1800', 2),\n",
       " ('a4', 5),\n",
       " ('341', 1),\n",
       " ('20170608', 1),\n",
       " ('cj4hnm2hrlo', 2),\n",
       " ('8d', 4),\n",
       " ('20130421', 1),\n",
       " ('😒', 15),\n",
       " ('1247335620', 1),\n",
       " ('1342', 2),\n",
       " ('20s', 1),\n",
       " ('🙃', 13),\n",
       " ('😤', 1),\n",
       " ('u7yg096tzk5o', 1),\n",
       " ('n14528', 1),\n",
       " ('20081223', 2),\n",
       " ('222222222222222222222222222222', 1),\n",
       " ('555555', 8),\n",
       " ('288', 2),\n",
       " ('137', 6),\n",
       " ('309', 1),\n",
       " ('t5f61b4b82', 1),\n",
       " ('7654', 1),\n",
       " ('063', 1),\n",
       " ('4141', 1),\n",
       " ('102', 2),\n",
       " ('✌', 2),\n",
       " ('71a5640b2f22570da2243cd6711589fb', 1),\n",
       " ('2r', 1),\n",
       " ('66min', 1),\n",
       " ('🏃', 3),\n",
       " ('͚', 1),\n",
       " ('719', 1),\n",
       " ('11e0', 3),\n",
       " ('20070813', 2),\n",
       " ('17', 226),\n",
       " ('93008', 2),\n",
       " ('1224', 1),\n",
       " ('00m', 1),\n",
       " ('251f2c4a8392d2c4c73d7798d666c65690bfe1af', 1),\n",
       " ('👍', 61),\n",
       " ('2014', 242),\n",
       " ('m1', 2),\n",
       " ('tt0398286', 1),\n",
       " ('tt0116213', 1),\n",
       " ('1302383', 1),\n",
       " ('20100820', 1),\n",
       " ('5h', 1),\n",
       " ('3775', 1),\n",
       " ('av2319432', 1),\n",
       " ('1taiwan', 1),\n",
       " ('mp4ba', 2),\n",
       " ('777', 2),\n",
       " ('7442199', 1),\n",
       " ('355', 2),\n",
       " ('863', 1),\n",
       " ('0706cgv', 1),\n",
       " ('gp60', 1),\n",
       " ('1933', 2),\n",
       " ('20120523', 1),\n",
       " ('2cds', 1),\n",
       " ('top5', 7),\n",
       " ('253', 1),\n",
       " ('tt1590089', 1),\n",
       " ('bd056', 1),\n",
       " ('68', 23),\n",
       " ('st9', 1),\n",
       " ('e4', 14),\n",
       " ('12949', 1),\n",
       " ('☑', 1),\n",
       " ('siff07', 1),\n",
       " ('㧰', 1),\n",
       " ('🇨', 1),\n",
       " ('part5', 1),\n",
       " ('4477366', 1),\n",
       " ('3idiots', 1),\n",
       " ('1728044652', 1),\n",
       " ('34', 16),\n",
       " ('┻', 12),\n",
       " ('c5u7iyaufm', 1),\n",
       " ('cf0072010', 1),\n",
       " ('189', 2),\n",
       " ('100316', 1),\n",
       " ('iwyy2nvgmy', 1),\n",
       " ('1246043', 1),\n",
       " ('av2278807', 1),\n",
       " ('4qju1lyrpm4', 1),\n",
       " ('p3', 1),\n",
       " ('౪', 1),\n",
       " ('65535', 2),\n",
       " ('282', 1),\n",
       " ('100101', 1),\n",
       " ('336124468', 1),\n",
       " ('20070804', 1),\n",
       " ('◡', 4),\n",
       " ('485033595', 1),\n",
       " ('20min', 6),\n",
       " ('gblchd0kep4', 2),\n",
       " ('419', 3),\n",
       " ('30411', 1),\n",
       " ('ody2mta0mdm', 2),\n",
       " ('≡', 9),\n",
       " ('108', 16),\n",
       " ('pmyx2015', 1),\n",
       " ('douban250', 2),\n",
       " ('b0', 2),\n",
       " ('c3po', 1),\n",
       " ('20160424', 1),\n",
       " ('20170526', 1),\n",
       " ('67', 14),\n",
       " ('575', 1),\n",
       " ('cqmrxz8b8', 1),\n",
       " ('0115', 1),\n",
       " ('a1da', 1),\n",
       " ('1030', 1),\n",
       " ('f9f93311c4', 1),\n",
       " ('150202', 1),\n",
       " ('➳', 1),\n",
       " ('5800', 1),\n",
       " ('\\ue41f', 3),\n",
       " ('ww4', 1),\n",
       " ('📓', 1),\n",
       " ('20070721', 1),\n",
       " ('28655m', 1),\n",
       " ('h8', 1),\n",
       " ('3834', 1),\n",
       " ('169', 4),\n",
       " ('ku6', 2),\n",
       " ('6271', 1),\n",
       " ('̮', 1),\n",
       " ('4022', 1),\n",
       " ('10000', 6),\n",
       " ('ww1', 1),\n",
       " ('120fps', 1),\n",
       " ('onizuka47', 1),\n",
       " ('392', 2),\n",
       " ('f82708b1de47a24adf0f499c', 1),\n",
       " ('3900', 1),\n",
       " ('👌', 3),\n",
       " ('3413', 2),\n",
       " ('xmzi3ntm2ma', 2),\n",
       " ('0726', 2),\n",
       " ('̅', 1),\n",
       " ('20170401', 1),\n",
       " ('❛', 4),\n",
       " ('2012july20', 1),\n",
       " ('20100515', 1),\n",
       " ('‿', 6),\n",
       " ('҉', 17),\n",
       " ('20120405', 1),\n",
       " ('u22', 1),\n",
       " ('762', 1),\n",
       " ('976m', 2),\n",
       " ('7g', 1),\n",
       " ('20070928', 1),\n",
       " ('b3', 7),\n",
       " ('‾', 3),\n",
       " ('101008', 1),\n",
       " ('700', 5),\n",
       " ('🌹', 2),\n",
       " ('807', 1),\n",
       " ('303', 2),\n",
       " ('20081010', 1),\n",
       " ('xmja4ndc4mdq4', 1),\n",
       " ('pigu6', 1),\n",
       " ('u2', 14),\n",
       " ('∙', 1),\n",
       " ('1100', 3),\n",
       " ('20121224jtl', 1),\n",
       " ('1108', 2),\n",
       " ('1292347', 2),\n",
       " ...]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unknown_others = list(\n",
    "    set(unknown_words) - set(unknown_chinese) - set(unknown_string))\n",
    "unknown_others"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse a lowercase letter repeated 4+ times in a row down to one letter.\n",
    "pat1 = re.compile(r\"([a-z])(\\1{3,})\")\n",
    "# Collapse a chunk that is followed by two more copies of itself down to one copy.\n",
    "pat2 = re.compile(r\"(\\w+)(\\1{2,}?)\")\n",
    "\n",
    "# Maps a token from unknown_words to its collapsed form; tokens that neither\n",
    "# pattern changes get no entry.\n",
    "cleaned_unknown = {}\n",
    "for w, _ in unknown_words:\n",
    "    cleaned = pat1.sub(r\"\\1\", w)\n",
    "    if cleaned == w:\n",
    "        # The letter pattern did nothing, so try the repeated-chunk pattern.\n",
    "        # (This step used to call pat1 a second time, which left pat2 unused.)\n",
    "        cleaned = pat2.sub(r\"\\1\", w)\n",
    "    if cleaned != w:\n",
    "        cleaned_unknown[w] = cleaned"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 重新处理文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cut_join(text):\n",
    "    \"\"\"Segment ``text`` with jieba and return the kept tokens joined by spaces.\n",
    "\n",
    "    A jieba token found in ``wv`` is kept unchanged. Any other token is passed\n",
    "    to ``cut`` when ``is_chinese`` or ``is_string`` accepts it; in the\n",
    "    ``is_string`` case repeated pieces are kept only once, in order of first\n",
    "    appearance. Tokens matching none of these checks are dropped.\n",
    "    \"\"\"\n",
    "    tokens = []\n",
    "    for word in jieba.cut(text):\n",
    "        if word in wv:\n",
    "            tokens.append(word)\n",
    "        elif is_chinese(word):\n",
    "            tokens.extend(cut(word))\n",
    "        elif is_string(word):\n",
    "            # Keep only the first occurrence of each piece, preserving order.\n",
    "            seen = set()\n",
    "            for piece in cut(word):\n",
    "                if piece not in seen:\n",
    "                    seen.add(piece)\n",
    "                    tokens.append(piece)\n",
    "    return ' '.join(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['12abc43', ' ', '九百六十万平方公里', ' ', 'pbocyq5ccfs', ' ', 'woow']"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(jieba.cut('12abc43 九百六十万平方公里 pbocyq5ccfs woow'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'九百 六十万 平方公里 woo w'"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cut_join('12abc43 九百六十万平方公里 pbocyq5ccfs woow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "comments['cleaned_comment'] = comments['comment'].apply(preprocess)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'吴京': 312,\n",
       " '意淫': 279,\n",
       " '到': 10929,\n",
       " '了': 102423,\n",
       " '脑残': 503,\n",
       " '的': 328489,\n",
       " '地步': 199,\n",
       " '看': 35779,\n",
       " '恶心': 924,\n",
       " '想': 7766,\n",
       " '吐': 608,\n",
       " '首映礼': 42,\n",
       " '太': 15646,\n",
       " '恐怖': 596,\n",
       " '这个': 10269,\n",
       " '电影': 34739,\n",
       " '不讲道理': 8,\n",
       " '完全': 4152,\n",
       " '就是': 14014,\n",
       " '在': 31277,\n",
       " '实现': 270,\n",
       " '他': 10737,\n",
       " '小': 7156,\n",
       " '粉红': 39,\n",
       " '英雄': 1714,\n",
       " '梦': 911,\n",
       " '各种': 3136,\n",
       " '装备': 80,\n",
       " '轮番': 21,\n",
       " '上场': 17,\n",
       " '视': 43,\n",
       " '物理': 63,\n",
       " '逻辑': 1415,\n",
       " '于': 1845,\n",
       " '不顾': 63,\n",
       " '不得不': 670,\n",
       " '说': 11437,\n",
       " '有钱': 207,\n",
       " '真': 6293,\n",
       " '好': 24326,\n",
       " '随意': 170,\n",
       " '胡闹': 66,\n",
       " '炒作': 76,\n",
       " '水平': 825,\n",
       " '不输': 52,\n",
       " '冯小刚': 266,\n",
       " '但': 15677,\n",
       " '小刚': 22,\n",
       " '至少': 912,\n",
       " '不会': 2684,\n",
       " '用': 4223,\n",
       " '主旋律': 923,\n",
       " '来': 5912,\n",
       " '让': 13756,\n",
       " '人': 24861,\n",
       " '不': 29017,\n",
       " '舒服': 628,\n",
       " '为了': 3506,\n",
       " '而': 6686,\n",
       " '煽情': 1173,\n",
       " '觉得': 8881,\n",
       " '是': 73132,\n",
       " '个': 7505,\n",
       " '大': 6303,\n",
       " '做作': 822,\n",
       " '谎言': 266,\n",
       " '家': 668,\n",
       " '更新': 87,\n",
       " '片子': 9578,\n",
       " '整体': 1410,\n",
       " '不如': 1796,\n",
       " '湄公河': 57,\n",
       " '行动': 187,\n",
       " '不够': 1818,\n",
       " '流畅': 699,\n",
       " '编剧': 2270,\n",
       " '有毒': 38,\n",
       " '台词': 2368,\n",
       " '尴尬': 1655,\n",
       " '刻意': 1040,\n",
       " '显得': 1098,\n",
       " '如此': 2697,\n",
       " '不合时宜': 33,\n",
       " '又': 11580,\n",
       " '多余': 311,\n",
       " '凭良心说': 3,\n",
       " '看到': 5457,\n",
       " '不像': 306,\n",
       " '战狼': 36,\n",
       " '续集': 710,\n",
       " '完虐': 7,\n",
       " '中二': 75,\n",
       " '得': 10688,\n",
       " '很': 35093,\n",
       " '犯': 322,\n",
       " '我': 50533,\n",
       " '中华': 55,\n",
       " '者': 622,\n",
       " '虽远': 23,\n",
       " '必': 166,\n",
       " '诛': 40,\n",
       " '比': 6697,\n",
       " '这句': 231,\n",
       " '话': 1848,\n",
       " '还要': 780,\n",
       " '一百倍': 14,\n",
       " '脑子': 304,\n",
       " '东西': 2289,\n",
       " '希望': 1963,\n",
       " '们': 2775,\n",
       " '都': 36425,\n",
       " '能': 10070,\n",
       " '有': 28161,\n",
       " '三星': 1977,\n",
       " '半': 2024,\n",
       " '实打实': 26,\n",
       " '分': 3112,\n",
       " '第一集': 211,\n",
       " '爱国': 136,\n",
       " '内部': 115,\n",
       " '做': 4504,\n",
       " '着': 7550,\n",
       " '置换': 18,\n",
       " '与': 9571,\n",
       " '较劲': 17,\n",
       " '第二集': 69,\n",
       " '才': 5140,\n",
       " '真正': 1483,\n",
       " '显露': 17,\n",
       " '野心': 261,\n",
       " '终于': 1775,\n",
       " '抛弃': 225,\n",
       " '李忠志': 2,\n",
       " '新增': 7,\n",
       " '外来': 29,\n",
       " '班底': 73,\n",
       " '硬件': 24,\n",
       " '实力': 194,\n",
       " '机会': 386,\n",
       " '和': 31558,\n",
       " '国际': 208,\n",
       " '接轨': 7,\n",
       " '开篇': 193,\n",
       " '水下': 33,\n",
       " '长镜头': 686,\n",
       " '诸如': 40,\n",
       " '铁丝网': 3,\n",
       " '拦截': 9,\n",
       " '弹头': 8,\n",
       " '细节': 2270,\n",
       " '设计': 1178,\n",
       " '国产': 1121,\n",
       " '动作片': 1105,\n",
       " '重新': 452,\n",
       " '封顶': 4,\n",
       " '理念': 113,\n",
       " '上': 10339,\n",
       " '它': 3329,\n",
       " '甚至': 1278,\n",
       " '做到': 538,\n",
       " '绣春刀': 38,\n",
       " '最': 8844,\n",
       " '那': 7866,\n",
       " '部分': 1973,\n",
       " '惊险': 107,\n",
       " '大气': 181,\n",
       " '引人入胜': 103,\n",
       " '结合': 427,\n",
       " '不俗': 76,\n",
       " '快': 1389,\n",
       " '剪下': 2,\n",
       " '真刀真枪': 4,\n",
       " '不禁': 137,\n",
       " '热血沸腾': 225,\n",
       " '特别': 2693,\n",
       " '弹簧床': 3,\n",
       " '架': 108,\n",
       " '挡': 94,\n",
       " '炸弹': 100,\n",
       " '空手': 9,\n",
       " '接': 424,\n",
       " '碎玻璃': 4,\n",
       " '弹匣': 2,\n",
       " '割喉': 11,\n",
       " '等': 1783,\n",
       " '帅': 1785,\n",
       " '飞起': 39,\n",
       " '就算': 905,\n",
       " '前半段': 588,\n",
       " '铺垫': 587,\n",
       " '节奏': 3514,\n",
       " '散漫': 64,\n",
       " '主角': 2141,\n",
       " '光环': 217,\n",
       " '开': 663,\n",
       " '太大': 198,\n",
       " '也': 32110,\n",
       " '不怕': 136,\n",
       " '作为': 2412,\n",
       " '一个': 17875,\n",
       " '中国': 3581,\n",
       " '两个': 2953,\n",
       " '小时': 1810,\n",
       " '弥漫着': 37,\n",
       " '强大': 691,\n",
       " '不可': 522,\n",
       " '侵犯': 24,\n",
       " '氛围': 559,\n",
       " '还是': 16857,\n",
       " '那颗': 47,\n",
       " '民族': 324,\n",
       " '自豪': 24,\n",
       " '心': 1446,\n",
       " '砰砰': 53,\n",
       " '砰': 58,\n",
       " '跳': 626,\n",
       " '不停': 436,\n",
       " '冷峰': 1,\n",
       " '这部': 7640,\n",
       " '里': 8353,\n",
       " '即': 411,\n",
       " '像': 5869,\n",
       " '成龙': 883,\n",
       " '杰': 174,\n",
       " '森斯坦': 150,\n",
       " '森': 289,\n",
       " '体制': 213,\n",
       " '外': 571,\n",
       " '同': 668,\n",
       " '类型': 1509,\n",
       " '总是': 1988,\n",
       " '代表': 471,\n",
       " '个人': 1742,\n",
       " '无能': 186,\n",
       " '政府': 244,\n",
       " '需要': 1821,\n",
       " '求助于': 4,\n",
       " '这些': 1479,\n",
       " '才能': 836,\n",
       " '解决': 280,\n",
       " '难题': 53,\n",
       " '体现': 435,\n",
       " '价值': 334,\n",
       " '所以': 2768,\n",
       " '照抄': 25,\n",
       " '这种': 6498,\n",
       " '模式': 481,\n",
       " '实际上': 187,\n",
       " '问题': 2401,\n",
       " '我们': 6039,\n",
       " '以前': 1095,\n",
       " '嘲笑': 90,\n",
       " '英雄主义': 283,\n",
       " '却': 6297,\n",
       " '没想到': 973,\n",
       " '捆绑': 34,\n",
       " '爱国主义': 87,\n",
       " '全能': 17,\n",
       " '战士': 111,\n",
       " '更加': 722,\n",
       " '难以': 482,\n",
       " '下咽': 11,\n",
       " '多': 9864,\n",
       " '无脑': 261,\n",
       " '信': 268,\n",
       " '戏': 4504,\n",
       " '对': 10364,\n",
       " '路': 681,\n",
       " '转': 241,\n",
       " '粉': 732,\n",
       " '最后': 9966,\n",
       " '彩蛋': 561,\n",
       " '没有': 14763,\n",
       " '理由': 374,\n",
       " '期待': 1433,\n",
       " '下': 3700,\n",
       " '一部': 9687,\n",
       " '假': 896,\n",
       " '嗨': 214,\n",
       " '几处': 175,\n",
       " '情节': 3993,\n",
       " '设置': 521,\n",
       " '过于': 1025,\n",
       " '彰显': 67,\n",
       " '国家': 828,\n",
       " '自豪感': 5,\n",
       " '稍显': 201,\n",
       " '突兀': 390,\n",
       " '爽': 747,\n",
       " '片': 7936,\n",
       " '打戏': 363,\n",
       " '挺': 7039,\n",
       " '燃': 414,\n",
       " '但是': 7720,\n",
       " '故事': 14999,\n",
       " '一般': 3682,\n",
       " '达康': 13,\n",
       " '书记': 19,\n",
       " '合适': 297,\n",
       " '角色': 3735,\n",
       " '赵': 73,\n",
       " '东来': 11,\n",
       " '倒': 1759,\n",
       " '张瀚': 6,\n",
       " '太太': 199,\n",
       " '违': 89,\n",
       " '分钟': 1841,\n",
       " '穿越': 528,\n",
       " '回': 472,\n",
       " '偶像剧': 112,\n",
       " '接到': 24,\n",
       " '非洲': 134,\n",
       " '卧底': 189,\n",
       " '冷锋': 14,\n",
       " '报告': 26,\n",
       " '丁义珍': 6,\n",
       " '现在': 3750,\n",
       " '请求': 13,\n",
       " '抓捕': 8,\n",
       " '李达康': 4,\n",
       " '这件': 111,\n",
       " '事先': 24,\n",
       " '不要': 2509,\n",
       " '声张': 4,\n",
       " '别': 1021,\n",
       " '省厅': 3,\n",
       " '知道': 5386,\n",
       " '就': 25849,\n",
       " '你': 17305,\n",
       " '一起': 2803,\n",
       " '去': 7743,\n",
       " '加上': 901,\n",
       " '同志': 267,\n",
       " '三人': 140,\n",
       " '逮捕': 12,\n",
       " '这次': 940,\n",
       " '行': 965,\n",
       " '叫': 2115,\n",
       " '吧': 10860,\n",
       " '拍': 8828,\n",
       " '喜剧': 2741,\n",
       " '整个': 1787,\n",
       " '感觉': 8012,\n",
       " '搞笑': 2457,\n",
       " '这么': 6914,\n",
       " '打': 4207,\n",
       " '过': 3906,\n",
       " '徐晓冬': 1,\n",
       " '么': 3721,\n",
       " '往': 288,\n",
       " '一处': 77,\n",
       " '劲': 571,\n",
       " '使': 501,\n",
       " '梦想': 1192,\n",
       " '看吧': 154,\n",
       " '第一部': 2310,\n",
       " '好太多': 93,\n",
       " '谢谢': 239,\n",
       " '美': 3242,\n",
       " '队': 233,\n",
       " '动作': 3382,\n",
       " '指导': 121,\n",
       " '这': 18121,\n",
       " '火': 249,\n",
       " '没见识': 5,\n",
       " '开头': 1343,\n",
       " '长': 1547,\n",
       " '对决': 215,\n",
       " '可算': 12,\n",
       " '华语': 297,\n",
       " '顶尖': 28,\n",
       " '存在': 1295,\n",
       " '驱逐舰': 4,\n",
       " '导弹': 25,\n",
       " '坦克': 215,\n",
       " '商业片': 544,\n",
       " '狂': 215,\n",
       " '镜头': 4388,\n",
       " '运用': 397,\n",
       " '笑': 3895,\n",
       " '点': 4626,\n",
       " '插入': 85,\n",
       " '好莱坞': 1262,\n",
       " '爆米花': 560,\n",
       " '功': 68,\n",
       " '不过': 6535,\n",
       " '从头': 343,\n",
       " '打到': 67,\n",
       " '尾': 418,\n",
       " '拼': 384,\n",
       " '虽然': 5822,\n",
       " '有略': 4,\n",
       " '乱': 842,\n",
       " '时': 3073,\n",
       " '因为': 4085,\n",
       " '没': 12178,\n",
       " '啥': 2023,\n",
       " '期望值': 56,\n",
       " '被': 10004,\n",
       " '吓了一跳': 10,\n",
       " '吴刚': 10,\n",
       " '谦和': 6,\n",
       " '丁海峰': 1,\n",
       " '老': 3687,\n",
       " '三位': 111,\n",
       " '炖': 64,\n",
       " '烂熟': 9,\n",
       " '牛筋': 1,\n",
       " '嚼': 92,\n",
       " '用心': 637,\n",
       " '啊': 20660,\n",
       " '导演': 8665,\n",
       " '小看': 21,\n",
       " '确实': 2167,\n",
       " '下功夫': 27,\n",
       " '拉': 589,\n",
       " '借鉴': 174,\n",
       " '至于': 474,\n",
       " '大家': 1366,\n",
       " '比较': 3302,\n",
       " '反感': 144,\n",
       " '情绪': 1075,\n",
       " '那些': 2407,\n",
       " '桥段': 954,\n",
       " '必备': 58,\n",
       " '稍微': 405,\n",
       " '一点': 2999,\n",
       " '还': 17769,\n",
       " '可以': 8890,\n",
       " '接受': 925,\n",
       " '最好': 1951,\n",
       " '地方': 2173,\n",
       " '掌握': 151,\n",
       " '张弛': 69,\n",
       " '有度': 61,\n",
       " '这点': 285,\n",
       " '难得': 879,\n",
       " '一直': 3317,\n",
       " '脑子里': 62,\n",
       " '回响': 39,\n",
       " '片头': 357,\n",
       " '海里': 23,\n",
       " '那场': 491,\n",
       " '完': 5358,\n",
       " '呆': 318,\n",
       " '下去': 898,\n",
       " '太假': 193,\n",
       " '提前': 195,\n",
       " '离场': 130,\n",
       " '好看': 7751,\n",
       " '演技': 5621,\n",
       " '棒': 1208,\n",
       " '符合': 413,\n",
       " '反而': 828,\n",
       " '更': 6351,\n",
       " '差': 2073,\n",
       " '这一': 183,\n",
       " '放之四海而皆准': 3,\n",
       " '规律': 30,\n",
       " '场面': 2295,\n",
       " '越做越': 10,\n",
       " '然而': 793,\n",
       " '伴随': 124,\n",
       " '特效': 1974,\n",
       " '升级': 123,\n",
       " '叙事': 2098,\n",
       " '变得': 605,\n",
       " '非常': 4843,\n",
       " '凌乱': 212,\n",
       " '格局': 339,\n",
       " '颇': 395,\n",
       " '拍成': 473,\n",
       " '黑鹰坠落': 30,\n",
       " '结果': 1813,\n",
       " '撑': 287,\n",
       " '死': 4408,\n",
       " '最多': 108,\n",
       " '只是': 4107,\n",
       " '官方': 66,\n",
       " '版': 2599,\n",
       " '敢死队': 64,\n",
       " '但论': 8,\n",
       " '自我': 772,\n",
       " '角色定位': 15,\n",
       " '能力': 617,\n",
       " '远': 524,\n",
       " '如同': 318,\n",
       " '演员': 5510,\n",
       " '出身': 113,\n",
       " '甄子丹': 387,\n",
       " '喜欢': 13814,\n",
       " '不是': 8796,\n",
       " '装傻': 34,\n",
       " '真傻': 18,\n",
       " '要不是': 329,\n",
       " '真的': 7906,\n",
       " '别的': 445,\n",
       " '可': 2236,\n",
       " '肯定': 756,\n",
       " '选': 397,\n",
       " '直': 407,\n",
       " '男': 2273,\n",
       " '癌': 206,\n",
       " '令人发指': 105,\n",
       " '所有': 2058,\n",
       " '剧情': 11609,\n",
       " '走向': 374,\n",
       " '九十年代': 112,\n",
       " '那套': 56,\n",
       " '照搬': 101,\n",
       " '审美': 269,\n",
       " '事儿': 324,\n",
       " '一时': 106,\n",
       " '会儿': 20,\n",
       " '培养': 50,\n",
       " '出来': 3415,\n",
       " '整部': 1077,\n",
       " '延续': 368,\n",
       " '风格': 2810,\n",
       " '热血': 739,\n",
       " '要': 8463,\n",
       " '不错': 10243,\n",
       " '适合': 1841,\n",
       " '演': 4618,\n",
       " '军人': 81,\n",
       " '之前': 1517,\n",
       " '片段': 555,\n",
       " '念': 176,\n",
       " '劲儿': 154,\n",
       " '来说': 1554,\n",
       " '张翰': 53,\n",
       " '一': 4079,\n",
       " '一股': 338,\n",
       " '雷阵雨': 3,\n",
       " '画风': 447,\n",
       " '目': 159,\n",
       " '瞪': 53,\n",
       " '狗': 876,\n",
       " '瘠薄': 3,\n",
       " '人牛': 5,\n",
       " 'b': 169,\n",
       " '硬道理': 18,\n",
       " '隔壁': 128,\n",
       " '建军': 6,\n",
       " '大爷': 198,\n",
       " '你们': 1878,\n",
       " '场景': 1666,\n",
       " '战斗': 355,\n",
       " '全线': 14,\n",
       " '打斗': 1165,\n",
       " '游走': 49,\n",
       " '审查': 133,\n",
       " '红线': 12,\n",
       " '边界': 27,\n",
       " '政治': 1057,\n",
       " '安全': 124,\n",
       " '缝隙': 17,\n",
       " '部': 1082,\n",
       " '极具': 157,\n",
       " '煽动': 34,\n",
       " '大片': 1035,\n",
       " '制作': 1219,\n",
       " '精良': 215,\n",
       " '影片': 4879,\n",
       " '请': 1017,\n",
       " '多来': 11,\n",
       " '胶卷': 17,\n",
       " '过度': 269,\n",
       " '部队': 81,\n",
       " '没太多': 38,\n",
       " '展示': 353,\n",
       " '死去': 247,\n",
       " '反正': 625,\n",
       " '吸引': 807,\n",
       " '冲': 320,\n",
       " '为什么': 3226,\n",
       " '鄙视': 87,\n",
       " '敢': 389,\n",
       " '开拓': 23,\n",
       " '允许': 69,\n",
       " '他们': 3130,\n",
       " '再': 5510,\n",
       " '直到': 337,\n",
       " '更好': 1000,\n",
       " '拍出': 580,\n",
       " '出彩': 1005,\n",
       " '呢': 4500,\n",
       " '火爆': 201,\n",
       " '本片': 1954,\n",
       " '必将': 38,\n",
       " '燃爆': 58,\n",
       " '暑期': 54,\n",
       " '厉害': 847,\n",
       " '身为': 125,\n",
       " '武打': 320,\n",
       " '高标准': 3,\n",
       " '枪战': 696,\n",
       " '为': 5005,\n",
       " '点赞': 113,\n",
       " '热血男儿': 2,\n",
       " '荷尔蒙': 159,\n",
       " '爆发': 406,\n",
       " '给': 11149,\n",
       " '星': 4124,\n",
       " '血战': 39,\n",
       " '钢锯': 19,\n",
       " '岭': 36,\n",
       " '会': 8039,\n",
       " '歌颂': 73,\n",
       " '宗教': 719,\n",
       " '情怀': 958,\n",
       " '超越': 578,\n",
       " '政权': 33,\n",
       " '当': 3063,\n",
       " '只': 3392,\n",
       " '明显': 1200,\n",
       " '低': 825,\n",
       " '层次': 173,\n",
       " '充满': 1395,\n",
       " '现实': 2239,\n",
       " '乃至': 76,\n",
       " '投机': 29,\n",
       " '考量': 29,\n",
       " '高下': 49,\n",
       " '立': 116,\n",
       " '见': 810,\n",
       " '请问': 99,\n",
       " '脑': 480,\n",
       " '残': 400,\n",
       " '火箭炮': 6,\n",
       " '吗': 4966,\n",
       " '傲气': 6,\n",
       " '雄鹰': 3,\n",
       " '第一': 531,\n",
       " '滴血': 26,\n",
       " '算是': 1689,\n",
       " '国内': 510,\n",
       " '准': 100,\n",
       " '钱': 1016,\n",
       " '花': 541,\n",
       " '有效': 62,\n",
       " '气魄': 22,\n",
       " '创作': 266,\n",
       " '足够': 607,\n",
       " '真诚': 294,\n",
       " '人物': 3997,\n",
       " '连': 1887,\n",
       " '可爱': 3080,\n",
       " '如果': 3614,\n",
       " '当年': 1479,\n",
       " '那样': 1064,\n",
       " '膨胀': 34,\n",
       " '银幕': 474,\n",
       " '独占': 5,\n",
       " '聚光灯': 9,\n",
       " '走': 1876,\n",
       " '扪心自问': 8,\n",
       " '没法': 394,\n",
       " '评价': 588,\n",
       " '全片': 1218,\n",
       " '靠': 1264,\n",
       " '文戏': 309,\n",
       " '扯淡': 236,\n",
       " '女主角': 1429,\n",
       " '毫无': 1358,\n",
       " '必要': 477,\n",
       " '只要': 716,\n",
       " '开挂': 111,\n",
       " '牛': 2002,\n",
       " '逼': 3979,\n",
       " '之处': 121,\n",
       " '在于': 671,\n",
       " '透露': 122,\n",
       " '极': 354,\n",
       " '强烈': 550,\n",
       " '意识形态': 117,\n",
       " '枷锁': 32,\n",
       " '祖国': 67,\n",
       " '面前': 411,\n",
       " '一切': 1962,\n",
       " '反动派': 5,\n",
       " '纸老虎': 13,\n",
       " '人开': 3,\n",
       " '挂': 310,\n",
       " '团灭': 13,\n",
       " '合情合理': 33,\n",
       " '两星': 763,\n",
       " '鼓励': 462,\n",
       " '其他': 1757,\n",
       " '般': 893,\n",
       " '看点': 512,\n",
       " '有点': 7632,\n",
       " '手接': 2,\n",
       " '哈哈哈': 1620,\n",
       " '从': 4408,\n",
       " '之后': 2323,\n",
       " '炸': 345,\n",
       " '翻': 395,\n",
       " '一下': 1856,\n",
       " '四星': 1087,\n",
       " '当时': 1066,\n",
       " '其实': 5787,\n",
       " '完成度': 199,\n",
       " '接近': 310,\n",
       " '每个': 2145,\n",
       " '步骤': 8,\n",
       " '顺滑': 6,\n",
       " '任何': 1177,\n",
       " '出人意料': 76,\n",
       " '是因为': 882,\n",
       " '看看': 1629,\n",
       " '最近': 763,\n",
       " '世界': 3435,\n",
       " '抱歉': 76,\n",
       " '影院': 832,\n",
       " '起来': 2069,\n",
       " '魔幻': 373,\n",
       " '当然': 1233,\n",
       " '强拆': 38,\n",
       " '现实感': 16,\n",
       " '一幕': 481,\n",
       " '开场': 456,\n",
       " '搏斗': 38,\n",
       " '从来': 313,\n",
       " '其它': 192,\n",
       " '拍摄': 964,\n",
       " '难度': 72,\n",
       " '同时': 732,\n",
       " '技能': 81,\n",
       " '方面': 809,\n",
       " '要求': 319,\n",
       " '回来': 524,\n",
       " '搜': 65,\n",
       " '游泳': 38,\n",
       " '潜水': 9,\n",
       " '滑雪': 14,\n",
       " '飞机': 551,\n",
       " '射击': 41,\n",
       " '各项': 6,\n",
       " '特意': 108,\n",
       " '特种部队': 36,\n",
       " '当过': 7,\n",
       " '月': 617,\n",
       " '兵': 62,\n",
       " '佩服': 344,\n",
       " '这样': 6670,\n",
       " '星半': 149,\n",
       " '结束': 891,\n",
       " '掌声': 52,\n",
       " '出现': 1635,\n",
       " '近期': 118,\n",
       " '少见': 110,\n",
       " '一粒': 17,\n",
       " '大补丸': 1,\n",
       " '有人': 938,\n",
       " '吃': 1332,\n",
       " '开心': 773,\n",
       " '补大': 1,\n",
       " '从白': 1,\n",
       " '黑': 1228,\n",
       " '字幕': 769,\n",
       " '展现': 668,\n",
       " '超级': 1297,\n",
       " '糙': 151,\n",
       " '猛': 176,\n",
       " '媲美': 80,\n",
       " '终结者': 69,\n",
       " '无': 2120,\n",
       " '亮点': 1710,\n",
       " '变': 791,\n",
       " '谐星': 25,\n",
       " '掌控': 194,\n",
       " '逼近': 26,\n",
       " '不住': 254,\n",
       " '边缘': 168,\n",
       " '带': 1643,\n",
       " '感': 2443,\n",
       " '拳拳': 106,\n",
       " '肉': 396,\n",
       " '超爽': 13,\n",
       " '聪明': 350,\n",
       " '鸡': 255,\n",
       " '贼': 133,\n",
       " '一面': 303,\n",
       " '旗下': 9,\n",
       " '呈现': 436,\n",
       " '一出': 296,\n",
       " '重工业': 4,\n",
       " '娱乐': 495,\n",
       " '调控': 9,\n",
       " '说教': 313,\n",
       " '比例': 43,\n",
       " '尺度': 197,\n",
       " '大众': 203,\n",
       " '接纳': 26,\n",
       " '把握': 370,\n",
       " '微妙': 224,\n",
       " '其中': 759,\n",
       " '一些': 2105,\n",
       " '奇侠': 10,\n",
       " '化': 751,\n",
       " '内容': 1011,\n",
       " '比如': 613,\n",
       " '玻璃': 101,\n",
       " '碴': 12,\n",
       " '子': 666,\n",
       " '飞镖': 14,\n",
       " '杀敌': 10,\n",
       " '一类': 108,\n",
       " '只不过': 325,\n",
       " '遮盖': 13,\n",
       " '掉': 817,\n",
       " '老爹': 81,\n",
       " '演过': 76,\n",
       " '美剧': 93,\n",
       " '搏击': 76,\n",
       " '王国': 47,\n",
       " '力荐': 104,\n",
       " '那部': 143,\n",
       " '为啥': 477,\n",
       " '奇异': 83,\n",
       " '恩典': 2,\n",
       " '配乐': 2802,\n",
       " '画': 330,\n",
       " '内': 717,\n",
       " '男生': 188,\n",
       " '的话': 1392,\n",
       " '应该': 3298,\n",
       " '刺激': 620,\n",
       " '肾上腺素': 67,\n",
       " '女生': 334,\n",
       " '对龙': 2,\n",
       " '小云': 2,\n",
       " '感情': 1738,\n",
       " '十分': 830,\n",
       " '打动': 598,\n",
       " '模仿': 570,\n",
       " '许多': 556,\n",
       " '怎么': 4811,\n",
       " '玩': 1408,\n",
       " '一股脑': 15,\n",
       " '堆': 70,\n",
       " '槽': 630,\n",
       " '几位': 177,\n",
       " '血': 384,\n",
       " '厚到': 2,\n",
       " '科幻': 699,\n",
       " '级别': 187,\n",
       " '重复': 359,\n",
       " '满血': 15,\n",
       " '红血': 1,\n",
       " '中毒': 17,\n",
       " '极速': 34,\n",
       " '回血': 5,\n",
       " '爆': 1053,\n",
       " '种': 188,\n",
       " '打通': 14,\n",
       " '全场': 348,\n",
       " '太过': 636,\n",
       " '投机取巧': 10,\n",
       " '穿': 590,\n",
       " '迈克尔': 84,\n",
       " '贝都': 1,\n",
       " '不受': 11,\n",
       " '待见': 65,\n",
       " '国片': 21,\n",
       " '前仆后继': 9,\n",
       " '爆炸': 371,\n",
       " 'high': 203,\n",
       " '瞎': 666,\n",
       " '没用': 136,\n",
       " '女人': 2365,\n",
       " '缺': 270,\n",
       " '男人': 2372,\n",
       " '征服': 75,\n",
       " '美国': 2556,\n",
       " '不行': 1061,\n",
       " '全都': 247,\n",
       " '跟': 4241,\n",
       " '跳墙': 3,\n",
       " '一样': 4735,\n",
       " '拯救': 678,\n",
       " '国产片': 357,\n",
       " '以': 2020,\n",
       " '中印': 3,\n",
       " '局势': 11,\n",
       " '对比': 464,\n",
       " '假想': 8,\n",
       " '真是': 6447,\n",
       " '讽刺': 765,\n",
       " '谄媚': 19,\n",
       " '军旅': 11,\n",
       " '题材': 2762,\n",
       " '质感': 368,\n",
       " '国外': 150,\n",
       " '精彩': 2954,\n",
       " '看着': 1928,\n",
       " '有力': 226,\n",
       " '必须': 693,\n",
       " '安利': 43,\n",
       " '张': 284,\n",
       " '翰': 11,\n",
       " '简直': 2427,\n",
       " '承包': 28,\n",
       " '笑点': 1169,\n",
       " '量身定做': 34,\n",
       " '彭于': 223,\n",
       " '晏': 234,\n",
       " '可演': 6,\n",
       " '不来': 139,\n",
       " '不少': 1124,\n",
       " '漂移': 19,\n",
       " '无人机': 41,\n",
       " '突袭': 59,\n",
       " '直升机': 68,\n",
       " '坠': 18,\n",
       " '露': 278,\n",
       " '肉搏': 96,\n",
       " '军舰': 13,\n",
       " '发射': 18,\n",
       " '叛乱': 4,\n",
       " '国际化': 22,\n",
       " '视角': 778,\n",
       " '标配': 55,\n",
       " '饰演': 260,\n",
       " '深入人心': 49,\n",
       " '搏命': 43,\n",
       " '精神': 992,\n",
       " '当下': 254,\n",
       " '第三部': 307,\n",
       " '表白': 124,\n",
       " '典型': 884,\n",
       " '方式': 1254,\n",
       " '每次': 638,\n",
       " '猜': 573,\n",
       " '诶': 264,\n",
       " '问': 456,\n",
       " '王牌': 84,\n",
       " '特工': 401,\n",
       " '那么': 6603,\n",
       " '杀人': 525,\n",
       " '经过': 190,\n",
       " '艺术': 694,\n",
       " '处理': 1053,\n",
       " '直接': 1101,\n",
       " '删': 141,\n",
       " '血腥': 585,\n",
       " '屠杀': 63,\n",
       " '赤裸裸': 113,\n",
       " '大段': 149,\n",
       " '正确': 300,\n",
       " '庇': 4,\n",
       " '衣': 40,\n",
       " '意料之中': 88,\n",
       " '意料之外': 67,\n",
       " '惊喜': 1325,\n",
       " '属于': 748,\n",
       " '狼性': 4,\n",
       " '军魂': 2,\n",
       " '几个': 1930,\n",
       " '网红': 60,\n",
       " '弹弹琴': 2,\n",
       " '大国': 36,\n",
       " '气象': 13,\n",
       " '满屏': 79,\n",
       " '告诉': 936,\n",
       " '吴': 142,\n",
       " '迪塞尔': 58,\n",
       " '如入无人之境': 4,\n",
       " '亿': 123,\n",
       " '大陆': 716,\n",
       " '一刻': 195,\n",
       " '集体': 274,\n",
       " '勃起': 11,\n",
       " '离开': 469,\n",
       " '影厅': 29,\n",
       " '屌丝': 380,\n",
       " '同样': 867,\n",
       " '开始': 3351,\n",
       " '前': 2833,\n",
       " '屌': 347,\n",
       " '一万倍': 18,\n",
       " '一次': 2154,\n",
       " '标准': 540,\n",
       " '打造': 67,\n",
       " '美式': 303,\n",
       " '不可逆转': 5,\n",
       " '缺点': 340,\n",
       " '笑料': 394,\n",
       " '一定': 1932,\n",
       " '程度': 587,\n",
       " '地': 3369,\n",
       " '破坏': 209,\n",
       " '节奏感': 179,\n",
       " '斥': 4,\n",
       " '巨资': 6,\n",
       " '炮制': 25,\n",
       " '有所': 210,\n",
       " '体验': 382,\n",
       " '自然': 1174,\n",
       " ...}"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = build_vocab(comments['cleaned_comment'])\n",
    "vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词汇表中 100.00% 的单词有词向量\n",
      "评论的所有单词中 100.00% 的单词有词向量\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "check_coverage(vocab, wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'unkown': 0,\n",
       " '吴京': 1,\n",
       " '意淫': 2,\n",
       " '到': 3,\n",
       " '了': 4,\n",
       " '脑残': 5,\n",
       " '的': 6,\n",
       " '地步': 7,\n",
       " '看': 8,\n",
       " '恶心': 9,\n",
       " '想': 10,\n",
       " '吐': 11,\n",
       " '首映礼': 12,\n",
       " '太': 13,\n",
       " '恐怖': 14,\n",
       " '这个': 15,\n",
       " '电影': 16,\n",
       " '不讲道理': 17,\n",
       " '完全': 18,\n",
       " '就是': 19,\n",
       " '在': 20,\n",
       " '实现': 21,\n",
       " '他': 22,\n",
       " '小': 23,\n",
       " '粉红': 24,\n",
       " '英雄': 25,\n",
       " '梦': 26,\n",
       " '各种': 27,\n",
       " '装备': 28,\n",
       " '轮番': 29,\n",
       " '上场': 30,\n",
       " '视': 31,\n",
       " '物理': 32,\n",
       " '逻辑': 33,\n",
       " '于': 34,\n",
       " '不顾': 35,\n",
       " '不得不': 36,\n",
       " '说': 37,\n",
       " '有钱': 38,\n",
       " '真': 39,\n",
       " '好': 40,\n",
       " '随意': 41,\n",
       " '胡闹': 42,\n",
       " '炒作': 43,\n",
       " '水平': 44,\n",
       " '不输': 45,\n",
       " '冯小刚': 46,\n",
       " '但': 47,\n",
       " '小刚': 48,\n",
       " '至少': 49,\n",
       " '不会': 50,\n",
       " '用': 51,\n",
       " '主旋律': 52,\n",
       " '来': 53,\n",
       " '让': 54,\n",
       " '人': 55,\n",
       " '不': 56,\n",
       " '舒服': 57,\n",
       " '为了': 58,\n",
       " '而': 59,\n",
       " '煽情': 60,\n",
       " '觉得': 61,\n",
       " '是': 62,\n",
       " '个': 63,\n",
       " '大': 64,\n",
       " '做作': 65,\n",
       " '谎言': 66,\n",
       " '家': 67,\n",
       " '更新': 68,\n",
       " '片子': 69,\n",
       " '整体': 70,\n",
       " '不如': 71,\n",
       " '湄公河': 72,\n",
       " '行动': 73,\n",
       " '不够': 74,\n",
       " '流畅': 75,\n",
       " '编剧': 76,\n",
       " '有毒': 77,\n",
       " '台词': 78,\n",
       " '尴尬': 79,\n",
       " '刻意': 80,\n",
       " '显得': 81,\n",
       " '如此': 82,\n",
       " '不合时宜': 83,\n",
       " '又': 84,\n",
       " '多余': 85,\n",
       " '凭良心说': 86,\n",
       " '看到': 87,\n",
       " '不像': 88,\n",
       " '战狼': 89,\n",
       " '续集': 90,\n",
       " '完虐': 91,\n",
       " '中二': 92,\n",
       " '得': 93,\n",
       " '很': 94,\n",
       " '犯': 95,\n",
       " '我': 96,\n",
       " '中华': 97,\n",
       " '者': 98,\n",
       " '虽远': 99,\n",
       " '必': 100,\n",
       " '诛': 101,\n",
       " '比': 102,\n",
       " '这句': 103,\n",
       " '话': 104,\n",
       " '还要': 105,\n",
       " '一百倍': 106,\n",
       " '脑子': 107,\n",
       " '东西': 108,\n",
       " '希望': 109,\n",
       " '们': 110,\n",
       " '都': 111,\n",
       " '能': 112,\n",
       " '有': 113,\n",
       " '三星': 114,\n",
       " '半': 115,\n",
       " '实打实': 116,\n",
       " '分': 117,\n",
       " '第一集': 118,\n",
       " '爱国': 119,\n",
       " '内部': 120,\n",
       " '做': 121,\n",
       " '着': 122,\n",
       " '置换': 123,\n",
       " '与': 124,\n",
       " '较劲': 125,\n",
       " '第二集': 126,\n",
       " '才': 127,\n",
       " '真正': 128,\n",
       " '显露': 129,\n",
       " '野心': 130,\n",
       " '终于': 131,\n",
       " '抛弃': 132,\n",
       " '李忠志': 133,\n",
       " '新增': 134,\n",
       " '外来': 135,\n",
       " '班底': 136,\n",
       " '硬件': 137,\n",
       " '实力': 138,\n",
       " '机会': 139,\n",
       " '和': 140,\n",
       " '国际': 141,\n",
       " '接轨': 142,\n",
       " '开篇': 143,\n",
       " '水下': 144,\n",
       " '长镜头': 145,\n",
       " '诸如': 146,\n",
       " '铁丝网': 147,\n",
       " '拦截': 148,\n",
       " '弹头': 149,\n",
       " '细节': 150,\n",
       " '设计': 151,\n",
       " '国产': 152,\n",
       " '动作片': 153,\n",
       " '重新': 154,\n",
       " '封顶': 155,\n",
       " '理念': 156,\n",
       " '上': 157,\n",
       " '它': 158,\n",
       " '甚至': 159,\n",
       " '做到': 160,\n",
       " '绣春刀': 161,\n",
       " '最': 162,\n",
       " '那': 163,\n",
       " '部分': 164,\n",
       " '惊险': 165,\n",
       " '大气': 166,\n",
       " '引人入胜': 167,\n",
       " '结合': 168,\n",
       " '不俗': 169,\n",
       " '快': 170,\n",
       " '剪下': 171,\n",
       " '真刀真枪': 172,\n",
       " '不禁': 173,\n",
       " '热血沸腾': 174,\n",
       " '特别': 175,\n",
       " '弹簧床': 176,\n",
       " '架': 177,\n",
       " '挡': 178,\n",
       " '炸弹': 179,\n",
       " '空手': 180,\n",
       " '接': 181,\n",
       " '碎玻璃': 182,\n",
       " '弹匣': 183,\n",
       " '割喉': 184,\n",
       " '等': 185,\n",
       " '帅': 186,\n",
       " '飞起': 187,\n",
       " '就算': 188,\n",
       " '前半段': 189,\n",
       " '铺垫': 190,\n",
       " '节奏': 191,\n",
       " '散漫': 192,\n",
       " '主角': 193,\n",
       " '光环': 194,\n",
       " '开': 195,\n",
       " '太大': 196,\n",
       " '也': 197,\n",
       " '不怕': 198,\n",
       " '作为': 199,\n",
       " '一个': 200,\n",
       " '中国': 201,\n",
       " '两个': 202,\n",
       " '小时': 203,\n",
       " '弥漫着': 204,\n",
       " '强大': 205,\n",
       " '不可': 206,\n",
       " '侵犯': 207,\n",
       " '氛围': 208,\n",
       " '还是': 209,\n",
       " '那颗': 210,\n",
       " '民族': 211,\n",
       " '自豪': 212,\n",
       " '心': 213,\n",
       " '砰砰': 214,\n",
       " '砰': 215,\n",
       " '跳': 216,\n",
       " '不停': 217,\n",
       " '冷峰': 218,\n",
       " '这部': 219,\n",
       " '里': 220,\n",
       " '即': 221,\n",
       " '像': 222,\n",
       " '成龙': 223,\n",
       " '杰': 224,\n",
       " '森斯坦': 225,\n",
       " '森': 226,\n",
       " '体制': 227,\n",
       " '外': 228,\n",
       " '同': 229,\n",
       " '类型': 230,\n",
       " '总是': 231,\n",
       " '代表': 232,\n",
       " '个人': 233,\n",
       " '无能': 234,\n",
       " '政府': 235,\n",
       " '需要': 236,\n",
       " '求助于': 237,\n",
       " '这些': 238,\n",
       " '才能': 239,\n",
       " '解决': 240,\n",
       " '难题': 241,\n",
       " '体现': 242,\n",
       " '价值': 243,\n",
       " '所以': 244,\n",
       " '照抄': 245,\n",
       " '这种': 246,\n",
       " '模式': 247,\n",
       " '实际上': 248,\n",
       " '问题': 249,\n",
       " '我们': 250,\n",
       " '以前': 251,\n",
       " '嘲笑': 252,\n",
       " '英雄主义': 253,\n",
       " '却': 254,\n",
       " '没想到': 255,\n",
       " '捆绑': 256,\n",
       " '爱国主义': 257,\n",
       " '全能': 258,\n",
       " '战士': 259,\n",
       " '更加': 260,\n",
       " '难以': 261,\n",
       " '下咽': 262,\n",
       " '多': 263,\n",
       " '无脑': 264,\n",
       " '信': 265,\n",
       " '戏': 266,\n",
       " '对': 267,\n",
       " '路': 268,\n",
       " '转': 269,\n",
       " '粉': 270,\n",
       " '最后': 271,\n",
       " '彩蛋': 272,\n",
       " '没有': 273,\n",
       " '理由': 274,\n",
       " '期待': 275,\n",
       " '下': 276,\n",
       " '一部': 277,\n",
       " '假': 278,\n",
       " '嗨': 279,\n",
       " '几处': 280,\n",
       " '情节': 281,\n",
       " '设置': 282,\n",
       " '过于': 283,\n",
       " '彰显': 284,\n",
       " '国家': 285,\n",
       " '自豪感': 286,\n",
       " '稍显': 287,\n",
       " '突兀': 288,\n",
       " '爽': 289,\n",
       " '片': 290,\n",
       " '打戏': 291,\n",
       " '挺': 292,\n",
       " '燃': 293,\n",
       " '但是': 294,\n",
       " '故事': 295,\n",
       " '一般': 296,\n",
       " '达康': 297,\n",
       " '书记': 298,\n",
       " '合适': 299,\n",
       " '角色': 300,\n",
       " '赵': 301,\n",
       " '东来': 302,\n",
       " '倒': 303,\n",
       " '张瀚': 304,\n",
       " '太太': 305,\n",
       " '违': 306,\n",
       " '分钟': 307,\n",
       " '穿越': 308,\n",
       " '回': 309,\n",
       " '偶像剧': 310,\n",
       " '接到': 311,\n",
       " '非洲': 312,\n",
       " '卧底': 313,\n",
       " '冷锋': 314,\n",
       " '报告': 315,\n",
       " '丁义珍': 316,\n",
       " '现在': 317,\n",
       " '请求': 318,\n",
       " '抓捕': 319,\n",
       " '李达康': 320,\n",
       " '这件': 321,\n",
       " '事先': 322,\n",
       " '不要': 323,\n",
       " '声张': 324,\n",
       " '别': 325,\n",
       " '省厅': 326,\n",
       " '知道': 327,\n",
       " '就': 328,\n",
       " '你': 329,\n",
       " '一起': 330,\n",
       " '去': 331,\n",
       " '加上': 332,\n",
       " '同志': 333,\n",
       " '三人': 334,\n",
       " '逮捕': 335,\n",
       " '这次': 336,\n",
       " '行': 337,\n",
       " '叫': 338,\n",
       " '吧': 339,\n",
       " '拍': 340,\n",
       " '喜剧': 341,\n",
       " '整个': 342,\n",
       " '感觉': 343,\n",
       " '搞笑': 344,\n",
       " '这么': 345,\n",
       " '打': 346,\n",
       " '过': 347,\n",
       " '徐晓冬': 348,\n",
       " '么': 349,\n",
       " '往': 350,\n",
       " '一处': 351,\n",
       " '劲': 352,\n",
       " '使': 353,\n",
       " '梦想': 354,\n",
       " '看吧': 355,\n",
       " '第一部': 356,\n",
       " '好太多': 357,\n",
       " '谢谢': 358,\n",
       " '美': 359,\n",
       " '队': 360,\n",
       " '动作': 361,\n",
       " '指导': 362,\n",
       " '这': 363,\n",
       " '火': 364,\n",
       " '没见识': 365,\n",
       " '开头': 366,\n",
       " '长': 367,\n",
       " '对决': 368,\n",
       " '可算': 369,\n",
       " '华语': 370,\n",
       " '顶尖': 371,\n",
       " '存在': 372,\n",
       " '驱逐舰': 373,\n",
       " '导弹': 374,\n",
       " '坦克': 375,\n",
       " '商业片': 376,\n",
       " '狂': 377,\n",
       " '镜头': 378,\n",
       " '运用': 379,\n",
       " '笑': 380,\n",
       " '点': 381,\n",
       " '插入': 382,\n",
       " '好莱坞': 383,\n",
       " '爆米花': 384,\n",
       " '功': 385,\n",
       " '不过': 386,\n",
       " '从头': 387,\n",
       " '打到': 388,\n",
       " '尾': 389,\n",
       " '拼': 390,\n",
       " '虽然': 391,\n",
       " '有略': 392,\n",
       " '乱': 393,\n",
       " '时': 394,\n",
       " '因为': 395,\n",
       " '没': 396,\n",
       " '啥': 397,\n",
       " '期望值': 398,\n",
       " '被': 399,\n",
       " '吓了一跳': 400,\n",
       " '吴刚': 401,\n",
       " '谦和': 402,\n",
       " '丁海峰': 403,\n",
       " '老': 404,\n",
       " '三位': 405,\n",
       " '炖': 406,\n",
       " '烂熟': 407,\n",
       " '牛筋': 408,\n",
       " '嚼': 409,\n",
       " '用心': 410,\n",
       " '啊': 411,\n",
       " '导演': 412,\n",
       " '小看': 413,\n",
       " '确实': 414,\n",
       " '下功夫': 415,\n",
       " '拉': 416,\n",
       " '借鉴': 417,\n",
       " '至于': 418,\n",
       " '大家': 419,\n",
       " '比较': 420,\n",
       " '反感': 421,\n",
       " '情绪': 422,\n",
       " '那些': 423,\n",
       " '桥段': 424,\n",
       " '必备': 425,\n",
       " '稍微': 426,\n",
       " '一点': 427,\n",
       " '还': 428,\n",
       " '可以': 429,\n",
       " '接受': 430,\n",
       " '最好': 431,\n",
       " '地方': 432,\n",
       " '掌握': 433,\n",
       " '张弛': 434,\n",
       " '有度': 435,\n",
       " '这点': 436,\n",
       " '难得': 437,\n",
       " '一直': 438,\n",
       " '脑子里': 439,\n",
       " '回响': 440,\n",
       " '片头': 441,\n",
       " '海里': 442,\n",
       " '那场': 443,\n",
       " '完': 444,\n",
       " '呆': 445,\n",
       " '下去': 446,\n",
       " '太假': 447,\n",
       " '提前': 448,\n",
       " '离场': 449,\n",
       " '好看': 450,\n",
       " '演技': 451,\n",
       " '棒': 452,\n",
       " '符合': 453,\n",
       " '反而': 454,\n",
       " '更': 455,\n",
       " '差': 456,\n",
       " '这一': 457,\n",
       " '放之四海而皆准': 458,\n",
       " '规律': 459,\n",
       " '场面': 460,\n",
       " '越做越': 461,\n",
       " '然而': 462,\n",
       " '伴随': 463,\n",
       " '特效': 464,\n",
       " '升级': 465,\n",
       " '叙事': 466,\n",
       " '变得': 467,\n",
       " '非常': 468,\n",
       " '凌乱': 469,\n",
       " '格局': 470,\n",
       " '颇': 471,\n",
       " '拍成': 472,\n",
       " '黑鹰坠落': 473,\n",
       " '结果': 474,\n",
       " '撑': 475,\n",
       " '死': 476,\n",
       " '最多': 477,\n",
       " '只是': 478,\n",
       " '官方': 479,\n",
       " '版': 480,\n",
       " '敢死队': 481,\n",
       " '但论': 482,\n",
       " '自我': 483,\n",
       " '角色定位': 484,\n",
       " '能力': 485,\n",
       " '远': 486,\n",
       " '如同': 487,\n",
       " '演员': 488,\n",
       " '出身': 489,\n",
       " '甄子丹': 490,\n",
       " '喜欢': 491,\n",
       " '不是': 492,\n",
       " '装傻': 493,\n",
       " '真傻': 494,\n",
       " '要不是': 495,\n",
       " '真的': 496,\n",
       " '别的': 497,\n",
       " '可': 498,\n",
       " '肯定': 499,\n",
       " '选': 500,\n",
       " '直': 501,\n",
       " '男': 502,\n",
       " '癌': 503,\n",
       " '令人发指': 504,\n",
       " '所有': 505,\n",
       " '剧情': 506,\n",
       " '走向': 507,\n",
       " '九十年代': 508,\n",
       " '那套': 509,\n",
       " '照搬': 510,\n",
       " '审美': 511,\n",
       " '事儿': 512,\n",
       " '一时': 513,\n",
       " '会儿': 514,\n",
       " '培养': 515,\n",
       " '出来': 516,\n",
       " '整部': 517,\n",
       " '延续': 518,\n",
       " '风格': 519,\n",
       " '热血': 520,\n",
       " '要': 521,\n",
       " '不错': 522,\n",
       " '适合': 523,\n",
       " '演': 524,\n",
       " '军人': 525,\n",
       " '之前': 526,\n",
       " '片段': 527,\n",
       " '念': 528,\n",
       " '劲儿': 529,\n",
       " '来说': 530,\n",
       " '张翰': 531,\n",
       " '一': 532,\n",
       " '一股': 533,\n",
       " '雷阵雨': 534,\n",
       " '画风': 535,\n",
       " '目': 536,\n",
       " '瞪': 537,\n",
       " '狗': 538,\n",
       " '瘠薄': 539,\n",
       " '人牛': 540,\n",
       " 'b': 541,\n",
       " '硬道理': 542,\n",
       " '隔壁': 543,\n",
       " '建军': 544,\n",
       " '大爷': 545,\n",
       " '你们': 546,\n",
       " '场景': 547,\n",
       " '战斗': 548,\n",
       " '全线': 549,\n",
       " '打斗': 550,\n",
       " '游走': 551,\n",
       " '审查': 552,\n",
       " '红线': 553,\n",
       " '边界': 554,\n",
       " '政治': 555,\n",
       " '安全': 556,\n",
       " '缝隙': 557,\n",
       " '部': 558,\n",
       " '极具': 559,\n",
       " '煽动': 560,\n",
       " '大片': 561,\n",
       " '制作': 562,\n",
       " '精良': 563,\n",
       " '影片': 564,\n",
       " '请': 565,\n",
       " '多来': 566,\n",
       " '胶卷': 567,\n",
       " '过度': 568,\n",
       " '部队': 569,\n",
       " '没太多': 570,\n",
       " '展示': 571,\n",
       " '死去': 572,\n",
       " '反正': 573,\n",
       " '吸引': 574,\n",
       " '冲': 575,\n",
       " '为什么': 576,\n",
       " '鄙视': 577,\n",
       " '敢': 578,\n",
       " '开拓': 579,\n",
       " '允许': 580,\n",
       " '他们': 581,\n",
       " '再': 582,\n",
       " '直到': 583,\n",
       " '更好': 584,\n",
       " '拍出': 585,\n",
       " '出彩': 586,\n",
       " '呢': 587,\n",
       " '火爆': 588,\n",
       " '本片': 589,\n",
       " '必将': 590,\n",
       " '燃爆': 591,\n",
       " '暑期': 592,\n",
       " '厉害': 593,\n",
       " '身为': 594,\n",
       " '武打': 595,\n",
       " '高标准': 596,\n",
       " '枪战': 597,\n",
       " '为': 598,\n",
       " '点赞': 599,\n",
       " '热血男儿': 600,\n",
       " '荷尔蒙': 601,\n",
       " '爆发': 602,\n",
       " '给': 603,\n",
       " '星': 604,\n",
       " '血战': 605,\n",
       " '钢锯': 606,\n",
       " '岭': 607,\n",
       " '会': 608,\n",
       " '歌颂': 609,\n",
       " '宗教': 610,\n",
       " '情怀': 611,\n",
       " '超越': 612,\n",
       " '政权': 613,\n",
       " '当': 614,\n",
       " '只': 615,\n",
       " '明显': 616,\n",
       " '低': 617,\n",
       " '层次': 618,\n",
       " '充满': 619,\n",
       " '现实': 620,\n",
       " '乃至': 621,\n",
       " '投机': 622,\n",
       " '考量': 623,\n",
       " '高下': 624,\n",
       " '立': 625,\n",
       " '见': 626,\n",
       " '请问': 627,\n",
       " '脑': 628,\n",
       " '残': 629,\n",
       " '火箭炮': 630,\n",
       " '吗': 631,\n",
       " '傲气': 632,\n",
       " '雄鹰': 633,\n",
       " '第一': 634,\n",
       " '滴血': 635,\n",
       " '算是': 636,\n",
       " '国内': 637,\n",
       " '准': 638,\n",
       " '钱': 639,\n",
       " '花': 640,\n",
       " '有效': 641,\n",
       " '气魄': 642,\n",
       " '创作': 643,\n",
       " '足够': 644,\n",
       " '真诚': 645,\n",
       " '人物': 646,\n",
       " '连': 647,\n",
       " '可爱': 648,\n",
       " '如果': 649,\n",
       " '当年': 650,\n",
       " '那样': 651,\n",
       " '膨胀': 652,\n",
       " '银幕': 653,\n",
       " '独占': 654,\n",
       " '聚光灯': 655,\n",
       " '走': 656,\n",
       " '扪心自问': 657,\n",
       " '没法': 658,\n",
       " '评价': 659,\n",
       " '全片': 660,\n",
       " '靠': 661,\n",
       " '文戏': 662,\n",
       " '扯淡': 663,\n",
       " '女主角': 664,\n",
       " '毫无': 665,\n",
       " '必要': 666,\n",
       " '只要': 667,\n",
       " '开挂': 668,\n",
       " '牛': 669,\n",
       " '逼': 670,\n",
       " '之处': 671,\n",
       " '在于': 672,\n",
       " '透露': 673,\n",
       " '极': 674,\n",
       " '强烈': 675,\n",
       " '意识形态': 676,\n",
       " '枷锁': 677,\n",
       " '祖国': 678,\n",
       " '面前': 679,\n",
       " '一切': 680,\n",
       " '反动派': 681,\n",
       " '纸老虎': 682,\n",
       " '人开': 683,\n",
       " '挂': 684,\n",
       " '团灭': 685,\n",
       " '合情合理': 686,\n",
       " '两星': 687,\n",
       " '鼓励': 688,\n",
       " '其他': 689,\n",
       " '般': 690,\n",
       " '看点': 691,\n",
       " '有点': 692,\n",
       " '手接': 693,\n",
       " '哈哈哈': 694,\n",
       " '从': 695,\n",
       " '之后': 696,\n",
       " '炸': 697,\n",
       " '翻': 698,\n",
       " '一下': 699,\n",
       " '四星': 700,\n",
       " '当时': 701,\n",
       " '其实': 702,\n",
       " '完成度': 703,\n",
       " '接近': 704,\n",
       " '每个': 705,\n",
       " '步骤': 706,\n",
       " '顺滑': 707,\n",
       " '任何': 708,\n",
       " '出人意料': 709,\n",
       " '是因为': 710,\n",
       " '看看': 711,\n",
       " '最近': 712,\n",
       " '世界': 713,\n",
       " '抱歉': 714,\n",
       " '影院': 715,\n",
       " '起来': 716,\n",
       " '魔幻': 717,\n",
       " '当然': 718,\n",
       " '强拆': 719,\n",
       " '现实感': 720,\n",
       " '一幕': 721,\n",
       " '开场': 722,\n",
       " '搏斗': 723,\n",
       " '从来': 724,\n",
       " '其它': 725,\n",
       " '拍摄': 726,\n",
       " '难度': 727,\n",
       " '同时': 728,\n",
       " '技能': 729,\n",
       " '方面': 730,\n",
       " '要求': 731,\n",
       " '回来': 732,\n",
       " '搜': 733,\n",
       " '游泳': 734,\n",
       " '潜水': 735,\n",
       " '滑雪': 736,\n",
       " '飞机': 737,\n",
       " '射击': 738,\n",
       " '各项': 739,\n",
       " '特意': 740,\n",
       " '特种部队': 741,\n",
       " '当过': 742,\n",
       " '月': 743,\n",
       " '兵': 744,\n",
       " '佩服': 745,\n",
       " '这样': 746,\n",
       " '星半': 747,\n",
       " '结束': 748,\n",
       " '掌声': 749,\n",
       " '出现': 750,\n",
       " '近期': 751,\n",
       " '少见': 752,\n",
       " '一粒': 753,\n",
       " '大补丸': 754,\n",
       " '有人': 755,\n",
       " '吃': 756,\n",
       " '开心': 757,\n",
       " '补大': 758,\n",
       " '从白': 759,\n",
       " '黑': 760,\n",
       " '字幕': 761,\n",
       " '展现': 762,\n",
       " '超级': 763,\n",
       " '糙': 764,\n",
       " '猛': 765,\n",
       " '媲美': 766,\n",
       " '终结者': 767,\n",
       " '无': 768,\n",
       " '亮点': 769,\n",
       " '变': 770,\n",
       " '谐星': 771,\n",
       " '掌控': 772,\n",
       " '逼近': 773,\n",
       " '不住': 774,\n",
       " '边缘': 775,\n",
       " '带': 776,\n",
       " '感': 777,\n",
       " '拳拳': 778,\n",
       " '肉': 779,\n",
       " '超爽': 780,\n",
       " '聪明': 781,\n",
       " '鸡': 782,\n",
       " '贼': 783,\n",
       " '一面': 784,\n",
       " '旗下': 785,\n",
       " '呈现': 786,\n",
       " '一出': 787,\n",
       " '重工业': 788,\n",
       " '娱乐': 789,\n",
       " '调控': 790,\n",
       " '说教': 791,\n",
       " '比例': 792,\n",
       " '尺度': 793,\n",
       " '大众': 794,\n",
       " '接纳': 795,\n",
       " '把握': 796,\n",
       " '微妙': 797,\n",
       " '其中': 798,\n",
       " '一些': 799,\n",
       " '奇侠': 800,\n",
       " '化': 801,\n",
       " '内容': 802,\n",
       " '比如': 803,\n",
       " '玻璃': 804,\n",
       " '碴': 805,\n",
       " '子': 806,\n",
       " '飞镖': 807,\n",
       " '杀敌': 808,\n",
       " '一类': 809,\n",
       " '只不过': 810,\n",
       " '遮盖': 811,\n",
       " '掉': 812,\n",
       " '老爹': 813,\n",
       " '演过': 814,\n",
       " '美剧': 815,\n",
       " '搏击': 816,\n",
       " '王国': 817,\n",
       " '力荐': 818,\n",
       " '那部': 819,\n",
       " '为啥': 820,\n",
       " '奇异': 821,\n",
       " '恩典': 822,\n",
       " '配乐': 823,\n",
       " '画': 824,\n",
       " '内': 825,\n",
       " '男生': 826,\n",
       " '的话': 827,\n",
       " '应该': 828,\n",
       " '刺激': 829,\n",
       " '肾上腺素': 830,\n",
       " '女生': 831,\n",
       " '对龙': 832,\n",
       " '小云': 833,\n",
       " '感情': 834,\n",
       " '十分': 835,\n",
       " '打动': 836,\n",
       " '模仿': 837,\n",
       " '许多': 838,\n",
       " '怎么': 839,\n",
       " '玩': 840,\n",
       " '一股脑': 841,\n",
       " '堆': 842,\n",
       " '槽': 843,\n",
       " '几位': 844,\n",
       " '血': 845,\n",
       " '厚到': 846,\n",
       " '科幻': 847,\n",
       " '级别': 848,\n",
       " '重复': 849,\n",
       " '满血': 850,\n",
       " '红血': 851,\n",
       " '中毒': 852,\n",
       " '极速': 853,\n",
       " '回血': 854,\n",
       " '爆': 855,\n",
       " '种': 856,\n",
       " '打通': 857,\n",
       " '全场': 858,\n",
       " '太过': 859,\n",
       " '投机取巧': 860,\n",
       " '穿': 861,\n",
       " '迈克尔': 862,\n",
       " '贝都': 863,\n",
       " '不受': 864,\n",
       " '待见': 865,\n",
       " '国片': 866,\n",
       " '前仆后继': 867,\n",
       " '爆炸': 868,\n",
       " 'high': 869,\n",
       " '瞎': 870,\n",
       " '没用': 871,\n",
       " '女人': 872,\n",
       " '缺': 873,\n",
       " '男人': 874,\n",
       " '征服': 875,\n",
       " '美国': 876,\n",
       " '不行': 877,\n",
       " '全都': 878,\n",
       " '跟': 879,\n",
       " '跳墙': 880,\n",
       " '一样': 881,\n",
       " '拯救': 882,\n",
       " '国产片': 883,\n",
       " '以': 884,\n",
       " '中印': 885,\n",
       " '局势': 886,\n",
       " '对比': 887,\n",
       " '假想': 888,\n",
       " '真是': 889,\n",
       " '讽刺': 890,\n",
       " '谄媚': 891,\n",
       " '军旅': 892,\n",
       " '题材': 893,\n",
       " '质感': 894,\n",
       " '国外': 895,\n",
       " '精彩': 896,\n",
       " '看着': 897,\n",
       " '有力': 898,\n",
       " '必须': 899,\n",
       " '安利': 900,\n",
       " '张': 901,\n",
       " '翰': 902,\n",
       " '简直': 903,\n",
       " '承包': 904,\n",
       " '笑点': 905,\n",
       " '量身定做': 906,\n",
       " '彭于': 907,\n",
       " '晏': 908,\n",
       " '可演': 909,\n",
       " '不来': 910,\n",
       " '不少': 911,\n",
       " '漂移': 912,\n",
       " '无人机': 913,\n",
       " '突袭': 914,\n",
       " '直升机': 915,\n",
       " '坠': 916,\n",
       " '露': 917,\n",
       " '肉搏': 918,\n",
       " '军舰': 919,\n",
       " '发射': 920,\n",
       " '叛乱': 921,\n",
       " '国际化': 922,\n",
       " '视角': 923,\n",
       " '标配': 924,\n",
       " '饰演': 925,\n",
       " '深入人心': 926,\n",
       " '搏命': 927,\n",
       " '精神': 928,\n",
       " '当下': 929,\n",
       " '第三部': 930,\n",
       " '表白': 931,\n",
       " '典型': 932,\n",
       " '方式': 933,\n",
       " '每次': 934,\n",
       " '猜': 935,\n",
       " '诶': 936,\n",
       " '问': 937,\n",
       " '王牌': 938,\n",
       " '特工': 939,\n",
       " '那么': 940,\n",
       " '杀人': 941,\n",
       " '经过': 942,\n",
       " '艺术': 943,\n",
       " '处理': 944,\n",
       " '直接': 945,\n",
       " '删': 946,\n",
       " '血腥': 947,\n",
       " '屠杀': 948,\n",
       " '赤裸裸': 949,\n",
       " '大段': 950,\n",
       " '正确': 951,\n",
       " '庇': 952,\n",
       " '衣': 953,\n",
       " '意料之中': 954,\n",
       " '意料之外': 955,\n",
       " '惊喜': 956,\n",
       " '属于': 957,\n",
       " '狼性': 958,\n",
       " '军魂': 959,\n",
       " '几个': 960,\n",
       " '网红': 961,\n",
       " '弹弹琴': 962,\n",
       " '大国': 963,\n",
       " '气象': 964,\n",
       " '满屏': 965,\n",
       " '告诉': 966,\n",
       " '吴': 967,\n",
       " '迪塞尔': 968,\n",
       " '如入无人之境': 969,\n",
       " '亿': 970,\n",
       " '大陆': 971,\n",
       " '一刻': 972,\n",
       " '集体': 973,\n",
       " '勃起': 974,\n",
       " '离开': 975,\n",
       " '影厅': 976,\n",
       " '屌丝': 977,\n",
       " '同样': 978,\n",
       " '开始': 979,\n",
       " '前': 980,\n",
       " '屌': 981,\n",
       " '一万倍': 982,\n",
       " '一次': 983,\n",
       " '标准': 984,\n",
       " '打造': 985,\n",
       " '美式': 986,\n",
       " '不可逆转': 987,\n",
       " '缺点': 988,\n",
       " '笑料': 989,\n",
       " '一定': 990,\n",
       " '程度': 991,\n",
       " '地': 992,\n",
       " '破坏': 993,\n",
       " '节奏感': 994,\n",
       " '斥': 995,\n",
       " '巨资': 996,\n",
       " '炮制': 997,\n",
       " '有所': 998,\n",
       " '体验': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Assign each vocabulary word a unique integer id; id 0 is reserved for the\n",
    "# out-of-vocabulary placeholder. The key spelling 'unkown' is kept as-is\n",
    "# because other cells may look it up by that exact string.\n",
    "word2index = {'unkown': 0}\n",
    "for word in vocab:\n",
    "    # setdefault leaves an existing entry untouched, so a corpus token equal to\n",
    "    # the placeholder cannot overwrite id 0 or make later words share an id.\n",
    "    word2index.setdefault(word, len(word2index))\n",
    "word2index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'unkown',\n",
       " 1: '吴京',\n",
       " 2: '意淫',\n",
       " 3: '到',\n",
       " 4: '了',\n",
       " 5: '脑残',\n",
       " 6: '的',\n",
       " 7: '地步',\n",
       " 8: '看',\n",
       " 9: '恶心',\n",
       " 10: '想',\n",
       " 11: '吐',\n",
       " 12: '首映礼',\n",
       " 13: '太',\n",
       " 14: '恐怖',\n",
       " 15: '这个',\n",
       " 16: '电影',\n",
       " 17: '不讲道理',\n",
       " 18: '完全',\n",
       " 19: '就是',\n",
       " 20: '在',\n",
       " 21: '实现',\n",
       " 22: '他',\n",
       " 23: '小',\n",
       " 24: '粉红',\n",
       " 25: '英雄',\n",
       " 26: '梦',\n",
       " 27: '各种',\n",
       " 28: '装备',\n",
       " 29: '轮番',\n",
       " 30: '上场',\n",
       " 31: '视',\n",
       " 32: '物理',\n",
       " 33: '逻辑',\n",
       " 34: '于',\n",
       " 35: '不顾',\n",
       " 36: '不得不',\n",
       " 37: '说',\n",
       " 38: '有钱',\n",
       " 39: '真',\n",
       " 40: '好',\n",
       " 41: '随意',\n",
       " 42: '胡闹',\n",
       " 43: '炒作',\n",
       " 44: '水平',\n",
       " 45: '不输',\n",
       " 46: '冯小刚',\n",
       " 47: '但',\n",
       " 48: '小刚',\n",
       " 49: '至少',\n",
       " 50: '不会',\n",
       " 51: '用',\n",
       " 52: '主旋律',\n",
       " 53: '来',\n",
       " 54: '让',\n",
       " 55: '人',\n",
       " 56: '不',\n",
       " 57: '舒服',\n",
       " 58: '为了',\n",
       " 59: '而',\n",
       " 60: '煽情',\n",
       " 61: '觉得',\n",
       " 62: '是',\n",
       " 63: '个',\n",
       " 64: '大',\n",
       " 65: '做作',\n",
       " 66: '谎言',\n",
       " 67: '家',\n",
       " 68: '更新',\n",
       " 69: '片子',\n",
       " 70: '整体',\n",
       " 71: '不如',\n",
       " 72: '湄公河',\n",
       " 73: '行动',\n",
       " 74: '不够',\n",
       " 75: '流畅',\n",
       " 76: '编剧',\n",
       " 77: '有毒',\n",
       " 78: '台词',\n",
       " 79: '尴尬',\n",
       " 80: '刻意',\n",
       " 81: '显得',\n",
       " 82: '如此',\n",
       " 83: '不合时宜',\n",
       " 84: '又',\n",
       " 85: '多余',\n",
       " 86: '凭良心说',\n",
       " 87: '看到',\n",
       " 88: '不像',\n",
       " 89: '战狼',\n",
       " 90: '续集',\n",
       " 91: '完虐',\n",
       " 92: '中二',\n",
       " 93: '得',\n",
       " 94: '很',\n",
       " 95: '犯',\n",
       " 96: '我',\n",
       " 97: '中华',\n",
       " 98: '者',\n",
       " 99: '虽远',\n",
       " 100: '必',\n",
       " 101: '诛',\n",
       " 102: '比',\n",
       " 103: '这句',\n",
       " 104: '话',\n",
       " 105: '还要',\n",
       " 106: '一百倍',\n",
       " 107: '脑子',\n",
       " 108: '东西',\n",
       " 109: '希望',\n",
       " 110: '们',\n",
       " 111: '都',\n",
       " 112: '能',\n",
       " 113: '有',\n",
       " 114: '三星',\n",
       " 115: '半',\n",
       " 116: '实打实',\n",
       " 117: '分',\n",
       " 118: '第一集',\n",
       " 119: '爱国',\n",
       " 120: '内部',\n",
       " 121: '做',\n",
       " 122: '着',\n",
       " 123: '置换',\n",
       " 124: '与',\n",
       " 125: '较劲',\n",
       " 126: '第二集',\n",
       " 127: '才',\n",
       " 128: '真正',\n",
       " 129: '显露',\n",
       " 130: '野心',\n",
       " 131: '终于',\n",
       " 132: '抛弃',\n",
       " 133: '李忠志',\n",
       " 134: '新增',\n",
       " 135: '外来',\n",
       " 136: '班底',\n",
       " 137: '硬件',\n",
       " 138: '实力',\n",
       " 139: '机会',\n",
       " 140: '和',\n",
       " 141: '国际',\n",
       " 142: '接轨',\n",
       " 143: '开篇',\n",
       " 144: '水下',\n",
       " 145: '长镜头',\n",
       " 146: '诸如',\n",
       " 147: '铁丝网',\n",
       " 148: '拦截',\n",
       " 149: '弹头',\n",
       " 150: '细节',\n",
       " 151: '设计',\n",
       " 152: '国产',\n",
       " 153: '动作片',\n",
       " 154: '重新',\n",
       " 155: '封顶',\n",
       " 156: '理念',\n",
       " 157: '上',\n",
       " 158: '它',\n",
       " 159: '甚至',\n",
       " 160: '做到',\n",
       " 161: '绣春刀',\n",
       " 162: '最',\n",
       " 163: '那',\n",
       " 164: '部分',\n",
       " 165: '惊险',\n",
       " 166: '大气',\n",
       " 167: '引人入胜',\n",
       " 168: '结合',\n",
       " 169: '不俗',\n",
       " 170: '快',\n",
       " 171: '剪下',\n",
       " 172: '真刀真枪',\n",
       " 173: '不禁',\n",
       " 174: '热血沸腾',\n",
       " 175: '特别',\n",
       " 176: '弹簧床',\n",
       " 177: '架',\n",
       " 178: '挡',\n",
       " 179: '炸弹',\n",
       " 180: '空手',\n",
       " 181: '接',\n",
       " 182: '碎玻璃',\n",
       " 183: '弹匣',\n",
       " 184: '割喉',\n",
       " 185: '等',\n",
       " 186: '帅',\n",
       " 187: '飞起',\n",
       " 188: '就算',\n",
       " 189: '前半段',\n",
       " 190: '铺垫',\n",
       " 191: '节奏',\n",
       " 192: '散漫',\n",
       " 193: '主角',\n",
       " 194: '光环',\n",
       " 195: '开',\n",
       " 196: '太大',\n",
       " 197: '也',\n",
       " 198: '不怕',\n",
       " 199: '作为',\n",
       " 200: '一个',\n",
       " 201: '中国',\n",
       " 202: '两个',\n",
       " 203: '小时',\n",
       " 204: '弥漫着',\n",
       " 205: '强大',\n",
       " 206: '不可',\n",
       " 207: '侵犯',\n",
       " 208: '氛围',\n",
       " 209: '还是',\n",
       " 210: '那颗',\n",
       " 211: '民族',\n",
       " 212: '自豪',\n",
       " 213: '心',\n",
       " 214: '砰砰',\n",
       " 215: '砰',\n",
       " 216: '跳',\n",
       " 217: '不停',\n",
       " 218: '冷峰',\n",
       " 219: '这部',\n",
       " 220: '里',\n",
       " 221: '即',\n",
       " 222: '像',\n",
       " 223: '成龙',\n",
       " 224: '杰',\n",
       " 225: '森斯坦',\n",
       " 226: '森',\n",
       " 227: '体制',\n",
       " 228: '外',\n",
       " 229: '同',\n",
       " 230: '类型',\n",
       " 231: '总是',\n",
       " 232: '代表',\n",
       " 233: '个人',\n",
       " 234: '无能',\n",
       " 235: '政府',\n",
       " 236: '需要',\n",
       " 237: '求助于',\n",
       " 238: '这些',\n",
       " 239: '才能',\n",
       " 240: '解决',\n",
       " 241: '难题',\n",
       " 242: '体现',\n",
       " 243: '价值',\n",
       " 244: '所以',\n",
       " 245: '照抄',\n",
       " 246: '这种',\n",
       " 247: '模式',\n",
       " 248: '实际上',\n",
       " 249: '问题',\n",
       " 250: '我们',\n",
       " 251: '以前',\n",
       " 252: '嘲笑',\n",
       " 253: '英雄主义',\n",
       " 254: '却',\n",
       " 255: '没想到',\n",
       " 256: '捆绑',\n",
       " 257: '爱国主义',\n",
       " 258: '全能',\n",
       " 259: '战士',\n",
       " 260: '更加',\n",
       " 261: '难以',\n",
       " 262: '下咽',\n",
       " 263: '多',\n",
       " 264: '无脑',\n",
       " 265: '信',\n",
       " 266: '戏',\n",
       " 267: '对',\n",
       " 268: '路',\n",
       " 269: '转',\n",
       " 270: '粉',\n",
       " 271: '最后',\n",
       " 272: '彩蛋',\n",
       " 273: '没有',\n",
       " 274: '理由',\n",
       " 275: '期待',\n",
       " 276: '下',\n",
       " 277: '一部',\n",
       " 278: '假',\n",
       " 279: '嗨',\n",
       " 280: '几处',\n",
       " 281: '情节',\n",
       " 282: '设置',\n",
       " 283: '过于',\n",
       " 284: '彰显',\n",
       " 285: '国家',\n",
       " 286: '自豪感',\n",
       " 287: '稍显',\n",
       " 288: '突兀',\n",
       " 289: '爽',\n",
       " 290: '片',\n",
       " 291: '打戏',\n",
       " 292: '挺',\n",
       " 293: '燃',\n",
       " 294: '但是',\n",
       " 295: '故事',\n",
       " 296: '一般',\n",
       " 297: '达康',\n",
       " 298: '书记',\n",
       " 299: '合适',\n",
       " 300: '角色',\n",
       " 301: '赵',\n",
       " 302: '东来',\n",
       " 303: '倒',\n",
       " 304: '张瀚',\n",
       " 305: '太太',\n",
       " 306: '违',\n",
       " 307: '分钟',\n",
       " 308: '穿越',\n",
       " 309: '回',\n",
       " 310: '偶像剧',\n",
       " 311: '接到',\n",
       " 312: '非洲',\n",
       " 313: '卧底',\n",
       " 314: '冷锋',\n",
       " 315: '报告',\n",
       " 316: '丁义珍',\n",
       " 317: '现在',\n",
       " 318: '请求',\n",
       " 319: '抓捕',\n",
       " 320: '李达康',\n",
       " 321: '这件',\n",
       " 322: '事先',\n",
       " 323: '不要',\n",
       " 324: '声张',\n",
       " 325: '别',\n",
       " 326: '省厅',\n",
       " 327: '知道',\n",
       " 328: '就',\n",
       " 329: '你',\n",
       " 330: '一起',\n",
       " 331: '去',\n",
       " 332: '加上',\n",
       " 333: '同志',\n",
       " 334: '三人',\n",
       " 335: '逮捕',\n",
       " 336: '这次',\n",
       " 337: '行',\n",
       " 338: '叫',\n",
       " 339: '吧',\n",
       " 340: '拍',\n",
       " 341: '喜剧',\n",
       " 342: '整个',\n",
       " 343: '感觉',\n",
       " 344: '搞笑',\n",
       " 345: '这么',\n",
       " 346: '打',\n",
       " 347: '过',\n",
       " 348: '徐晓冬',\n",
       " 349: '么',\n",
       " 350: '往',\n",
       " 351: '一处',\n",
       " 352: '劲',\n",
       " 353: '使',\n",
       " 354: '梦想',\n",
       " 355: '看吧',\n",
       " 356: '第一部',\n",
       " 357: '好太多',\n",
       " 358: '谢谢',\n",
       " 359: '美',\n",
       " 360: '队',\n",
       " 361: '动作',\n",
       " 362: '指导',\n",
       " 363: '这',\n",
       " 364: '火',\n",
       " 365: '没见识',\n",
       " 366: '开头',\n",
       " 367: '长',\n",
       " 368: '对决',\n",
       " 369: '可算',\n",
       " 370: '华语',\n",
       " 371: '顶尖',\n",
       " 372: '存在',\n",
       " 373: '驱逐舰',\n",
       " 374: '导弹',\n",
       " 375: '坦克',\n",
       " 376: '商业片',\n",
       " 377: '狂',\n",
       " 378: '镜头',\n",
       " 379: '运用',\n",
       " 380: '笑',\n",
       " 381: '点',\n",
       " 382: '插入',\n",
       " 383: '好莱坞',\n",
       " 384: '爆米花',\n",
       " 385: '功',\n",
       " 386: '不过',\n",
       " 387: '从头',\n",
       " 388: '打到',\n",
       " 389: '尾',\n",
       " 390: '拼',\n",
       " 391: '虽然',\n",
       " 392: '有略',\n",
       " 393: '乱',\n",
       " 394: '时',\n",
       " 395: '因为',\n",
       " 396: '没',\n",
       " 397: '啥',\n",
       " 398: '期望值',\n",
       " 399: '被',\n",
       " 400: '吓了一跳',\n",
       " 401: '吴刚',\n",
       " 402: '谦和',\n",
       " 403: '丁海峰',\n",
       " 404: '老',\n",
       " 405: '三位',\n",
       " 406: '炖',\n",
       " 407: '烂熟',\n",
       " 408: '牛筋',\n",
       " 409: '嚼',\n",
       " 410: '用心',\n",
       " 411: '啊',\n",
       " 412: '导演',\n",
       " 413: '小看',\n",
       " 414: '确实',\n",
       " 415: '下功夫',\n",
       " 416: '拉',\n",
       " 417: '借鉴',\n",
       " 418: '至于',\n",
       " 419: '大家',\n",
       " 420: '比较',\n",
       " 421: '反感',\n",
       " 422: '情绪',\n",
       " 423: '那些',\n",
       " 424: '桥段',\n",
       " 425: '必备',\n",
       " 426: '稍微',\n",
       " 427: '一点',\n",
       " 428: '还',\n",
       " 429: '可以',\n",
       " 430: '接受',\n",
       " 431: '最好',\n",
       " 432: '地方',\n",
       " 433: '掌握',\n",
       " 434: '张弛',\n",
       " 435: '有度',\n",
       " 436: '这点',\n",
       " 437: '难得',\n",
       " 438: '一直',\n",
       " 439: '脑子里',\n",
       " 440: '回响',\n",
       " 441: '片头',\n",
       " 442: '海里',\n",
       " 443: '那场',\n",
       " 444: '完',\n",
       " 445: '呆',\n",
       " 446: '下去',\n",
       " 447: '太假',\n",
       " 448: '提前',\n",
       " 449: '离场',\n",
       " 450: '好看',\n",
       " 451: '演技',\n",
       " 452: '棒',\n",
       " 453: '符合',\n",
       " 454: '反而',\n",
       " 455: '更',\n",
       " 456: '差',\n",
       " 457: '这一',\n",
       " 458: '放之四海而皆准',\n",
       " 459: '规律',\n",
       " 460: '场面',\n",
       " 461: '越做越',\n",
       " 462: '然而',\n",
       " 463: '伴随',\n",
       " 464: '特效',\n",
       " 465: '升级',\n",
       " 466: '叙事',\n",
       " 467: '变得',\n",
       " 468: '非常',\n",
       " 469: '凌乱',\n",
       " 470: '格局',\n",
       " 471: '颇',\n",
       " 472: '拍成',\n",
       " 473: '黑鹰坠落',\n",
       " 474: '结果',\n",
       " 475: '撑',\n",
       " 476: '死',\n",
       " 477: '最多',\n",
       " 478: '只是',\n",
       " 479: '官方',\n",
       " 480: '版',\n",
       " 481: '敢死队',\n",
       " 482: '但论',\n",
       " 483: '自我',\n",
       " 484: '角色定位',\n",
       " 485: '能力',\n",
       " 486: '远',\n",
       " 487: '如同',\n",
       " 488: '演员',\n",
       " 489: '出身',\n",
       " 490: '甄子丹',\n",
       " 491: '喜欢',\n",
       " 492: '不是',\n",
       " 493: '装傻',\n",
       " 494: '真傻',\n",
       " 495: '要不是',\n",
       " 496: '真的',\n",
       " 497: '别的',\n",
       " 498: '可',\n",
       " 499: '肯定',\n",
       " 500: '选',\n",
       " 501: '直',\n",
       " 502: '男',\n",
       " 503: '癌',\n",
       " 504: '令人发指',\n",
       " 505: '所有',\n",
       " 506: '剧情',\n",
       " 507: '走向',\n",
       " 508: '九十年代',\n",
       " 509: '那套',\n",
       " 510: '照搬',\n",
       " 511: '审美',\n",
       " 512: '事儿',\n",
       " 513: '一时',\n",
       " 514: '会儿',\n",
       " 515: '培养',\n",
       " 516: '出来',\n",
       " 517: '整部',\n",
       " 518: '延续',\n",
       " 519: '风格',\n",
       " 520: '热血',\n",
       " 521: '要',\n",
       " 522: '不错',\n",
       " 523: '适合',\n",
       " 524: '演',\n",
       " 525: '军人',\n",
       " 526: '之前',\n",
       " 527: '片段',\n",
       " 528: '念',\n",
       " 529: '劲儿',\n",
       " 530: '来说',\n",
       " 531: '张翰',\n",
       " 532: '一',\n",
       " 533: '一股',\n",
       " 534: '雷阵雨',\n",
       " 535: '画风',\n",
       " 536: '目',\n",
       " 537: '瞪',\n",
       " 538: '狗',\n",
       " 539: '瘠薄',\n",
       " 540: '人牛',\n",
       " 541: 'b',\n",
       " 542: '硬道理',\n",
       " 543: '隔壁',\n",
       " 544: '建军',\n",
       " 545: '大爷',\n",
       " 546: '你们',\n",
       " 547: '场景',\n",
       " 548: '战斗',\n",
       " 549: '全线',\n",
       " 550: '打斗',\n",
       " 551: '游走',\n",
       " 552: '审查',\n",
       " 553: '红线',\n",
       " 554: '边界',\n",
       " 555: '政治',\n",
       " 556: '安全',\n",
       " 557: '缝隙',\n",
       " 558: '部',\n",
       " 559: '极具',\n",
       " 560: '煽动',\n",
       " 561: '大片',\n",
       " 562: '制作',\n",
       " 563: '精良',\n",
       " 564: '影片',\n",
       " 565: '请',\n",
       " 566: '多来',\n",
       " 567: '胶卷',\n",
       " 568: '过度',\n",
       " 569: '部队',\n",
       " 570: '没太多',\n",
       " 571: '展示',\n",
       " 572: '死去',\n",
       " 573: '反正',\n",
       " 574: '吸引',\n",
       " 575: '冲',\n",
       " 576: '为什么',\n",
       " 577: '鄙视',\n",
       " 578: '敢',\n",
       " 579: '开拓',\n",
       " 580: '允许',\n",
       " 581: '他们',\n",
       " 582: '再',\n",
       " 583: '直到',\n",
       " 584: '更好',\n",
       " 585: '拍出',\n",
       " 586: '出彩',\n",
       " 587: '呢',\n",
       " 588: '火爆',\n",
       " 589: '本片',\n",
       " 590: '必将',\n",
       " 591: '燃爆',\n",
       " 592: '暑期',\n",
       " 593: '厉害',\n",
       " 594: '身为',\n",
       " 595: '武打',\n",
       " 596: '高标准',\n",
       " 597: '枪战',\n",
       " 598: '为',\n",
       " 599: '点赞',\n",
       " 600: '热血男儿',\n",
       " 601: '荷尔蒙',\n",
       " 602: '爆发',\n",
       " 603: '给',\n",
       " 604: '星',\n",
       " 605: '血战',\n",
       " 606: '钢锯',\n",
       " 607: '岭',\n",
       " 608: '会',\n",
       " 609: '歌颂',\n",
       " 610: '宗教',\n",
       " 611: '情怀',\n",
       " 612: '超越',\n",
       " 613: '政权',\n",
       " 614: '当',\n",
       " 615: '只',\n",
       " 616: '明显',\n",
       " 617: '低',\n",
       " 618: '层次',\n",
       " 619: '充满',\n",
       " 620: '现实',\n",
       " 621: '乃至',\n",
       " 622: '投机',\n",
       " 623: '考量',\n",
       " 624: '高下',\n",
       " 625: '立',\n",
       " 626: '见',\n",
       " 627: '请问',\n",
       " 628: '脑',\n",
       " 629: '残',\n",
       " 630: '火箭炮',\n",
       " 631: '吗',\n",
       " 632: '傲气',\n",
       " 633: '雄鹰',\n",
       " 634: '第一',\n",
       " 635: '滴血',\n",
       " 636: '算是',\n",
       " 637: '国内',\n",
       " 638: '准',\n",
       " 639: '钱',\n",
       " 640: '花',\n",
       " 641: '有效',\n",
       " 642: '气魄',\n",
       " 643: '创作',\n",
       " 644: '足够',\n",
       " 645: '真诚',\n",
       " 646: '人物',\n",
       " 647: '连',\n",
       " 648: '可爱',\n",
       " 649: '如果',\n",
       " 650: '当年',\n",
       " 651: '那样',\n",
       " 652: '膨胀',\n",
       " 653: '银幕',\n",
       " 654: '独占',\n",
       " 655: '聚光灯',\n",
       " 656: '走',\n",
       " 657: '扪心自问',\n",
       " 658: '没法',\n",
       " 659: '评价',\n",
       " 660: '全片',\n",
       " 661: '靠',\n",
       " 662: '文戏',\n",
       " 663: '扯淡',\n",
       " 664: '女主角',\n",
       " 665: '毫无',\n",
       " 666: '必要',\n",
       " 667: '只要',\n",
       " 668: '开挂',\n",
       " 669: '牛',\n",
       " 670: '逼',\n",
       " 671: '之处',\n",
       " 672: '在于',\n",
       " 673: '透露',\n",
       " 674: '极',\n",
       " 675: '强烈',\n",
       " 676: '意识形态',\n",
       " 677: '枷锁',\n",
       " 678: '祖国',\n",
       " 679: '面前',\n",
       " 680: '一切',\n",
       " 681: '反动派',\n",
       " 682: '纸老虎',\n",
       " 683: '人开',\n",
       " 684: '挂',\n",
       " 685: '团灭',\n",
       " 686: '合情合理',\n",
       " 687: '两星',\n",
       " 688: '鼓励',\n",
       " 689: '其他',\n",
       " 690: '般',\n",
       " 691: '看点',\n",
       " 692: '有点',\n",
       " 693: '手接',\n",
       " 694: '哈哈哈',\n",
       " 695: '从',\n",
       " 696: '之后',\n",
       " 697: '炸',\n",
       " 698: '翻',\n",
       " 699: '一下',\n",
       " 700: '四星',\n",
       " 701: '当时',\n",
       " 702: '其实',\n",
       " 703: '完成度',\n",
       " 704: '接近',\n",
       " 705: '每个',\n",
       " 706: '步骤',\n",
       " 707: '顺滑',\n",
       " 708: '任何',\n",
       " 709: '出人意料',\n",
       " 710: '是因为',\n",
       " 711: '看看',\n",
       " 712: '最近',\n",
       " 713: '世界',\n",
       " 714: '抱歉',\n",
       " 715: '影院',\n",
       " 716: '起来',\n",
       " 717: '魔幻',\n",
       " 718: '当然',\n",
       " 719: '强拆',\n",
       " 720: '现实感',\n",
       " 721: '一幕',\n",
       " 722: '开场',\n",
       " 723: '搏斗',\n",
       " 724: '从来',\n",
       " 725: '其它',\n",
       " 726: '拍摄',\n",
       " 727: '难度',\n",
       " 728: '同时',\n",
       " 729: '技能',\n",
       " 730: '方面',\n",
       " 731: '要求',\n",
       " 732: '回来',\n",
       " 733: '搜',\n",
       " 734: '游泳',\n",
       " 735: '潜水',\n",
       " 736: '滑雪',\n",
       " 737: '飞机',\n",
       " 738: '射击',\n",
       " 739: '各项',\n",
       " 740: '特意',\n",
       " 741: '特种部队',\n",
       " 742: '当过',\n",
       " 743: '月',\n",
       " 744: '兵',\n",
       " 745: '佩服',\n",
       " 746: '这样',\n",
       " 747: '星半',\n",
       " 748: '结束',\n",
       " 749: '掌声',\n",
       " 750: '出现',\n",
       " 751: '近期',\n",
       " 752: '少见',\n",
       " 753: '一粒',\n",
       " 754: '大补丸',\n",
       " 755: '有人',\n",
       " 756: '吃',\n",
       " 757: '开心',\n",
       " 758: '补大',\n",
       " 759: '从白',\n",
       " 760: '黑',\n",
       " 761: '字幕',\n",
       " 762: '展现',\n",
       " 763: '超级',\n",
       " 764: '糙',\n",
       " 765: '猛',\n",
       " 766: '媲美',\n",
       " 767: '终结者',\n",
       " 768: '无',\n",
       " 769: '亮点',\n",
       " 770: '变',\n",
       " 771: '谐星',\n",
       " 772: '掌控',\n",
       " 773: '逼近',\n",
       " 774: '不住',\n",
       " 775: '边缘',\n",
       " 776: '带',\n",
       " 777: '感',\n",
       " 778: '拳拳',\n",
       " 779: '肉',\n",
       " 780: '超爽',\n",
       " 781: '聪明',\n",
       " 782: '鸡',\n",
       " 783: '贼',\n",
       " 784: '一面',\n",
       " 785: '旗下',\n",
       " 786: '呈现',\n",
       " 787: '一出',\n",
       " 788: '重工业',\n",
       " 789: '娱乐',\n",
       " 790: '调控',\n",
       " 791: '说教',\n",
       " 792: '比例',\n",
       " 793: '尺度',\n",
       " 794: '大众',\n",
       " 795: '接纳',\n",
       " 796: '把握',\n",
       " 797: '微妙',\n",
       " 798: '其中',\n",
       " 799: '一些',\n",
       " 800: '奇侠',\n",
       " 801: '化',\n",
       " 802: '内容',\n",
       " 803: '比如',\n",
       " 804: '玻璃',\n",
       " 805: '碴',\n",
       " 806: '子',\n",
       " 807: '飞镖',\n",
       " 808: '杀敌',\n",
       " 809: '一类',\n",
       " 810: '只不过',\n",
       " 811: '遮盖',\n",
       " 812: '掉',\n",
       " 813: '老爹',\n",
       " 814: '演过',\n",
       " 815: '美剧',\n",
       " 816: '搏击',\n",
       " 817: '王国',\n",
       " 818: '力荐',\n",
       " 819: '那部',\n",
       " 820: '为啥',\n",
       " 821: '奇异',\n",
       " 822: '恩典',\n",
       " 823: '配乐',\n",
       " 824: '画',\n",
       " 825: '内',\n",
       " 826: '男生',\n",
       " 827: '的话',\n",
       " 828: '应该',\n",
       " 829: '刺激',\n",
       " 830: '肾上腺素',\n",
       " 831: '女生',\n",
       " 832: '对龙',\n",
       " 833: '小云',\n",
       " 834: '感情',\n",
       " 835: '十分',\n",
       " 836: '打动',\n",
       " 837: '模仿',\n",
       " 838: '许多',\n",
       " 839: '怎么',\n",
       " 840: '玩',\n",
       " 841: '一股脑',\n",
       " 842: '堆',\n",
       " 843: '槽',\n",
       " 844: '几位',\n",
       " 845: '血',\n",
       " 846: '厚到',\n",
       " 847: '科幻',\n",
       " 848: '级别',\n",
       " 849: '重复',\n",
       " 850: '满血',\n",
       " 851: '红血',\n",
       " 852: '中毒',\n",
       " 853: '极速',\n",
       " 854: '回血',\n",
       " 855: '爆',\n",
       " 856: '种',\n",
       " 857: '打通',\n",
       " 858: '全场',\n",
       " 859: '太过',\n",
       " 860: '投机取巧',\n",
       " 861: '穿',\n",
       " 862: '迈克尔',\n",
       " 863: '贝都',\n",
       " 864: '不受',\n",
       " 865: '待见',\n",
       " 866: '国片',\n",
       " 867: '前仆后继',\n",
       " 868: '爆炸',\n",
       " 869: 'high',\n",
       " 870: '瞎',\n",
       " 871: '没用',\n",
       " 872: '女人',\n",
       " 873: '缺',\n",
       " 874: '男人',\n",
       " 875: '征服',\n",
       " 876: '美国',\n",
       " 877: '不行',\n",
       " 878: '全都',\n",
       " 879: '跟',\n",
       " 880: '跳墙',\n",
       " 881: '一样',\n",
       " 882: '拯救',\n",
       " 883: '国产片',\n",
       " 884: '以',\n",
       " 885: '中印',\n",
       " 886: '局势',\n",
       " 887: '对比',\n",
       " 888: '假想',\n",
       " 889: '真是',\n",
       " 890: '讽刺',\n",
       " 891: '谄媚',\n",
       " 892: '军旅',\n",
       " 893: '题材',\n",
       " 894: '质感',\n",
       " 895: '国外',\n",
       " 896: '精彩',\n",
       " 897: '看着',\n",
       " 898: '有力',\n",
       " 899: '必须',\n",
       " 900: '安利',\n",
       " 901: '张',\n",
       " 902: '翰',\n",
       " 903: '简直',\n",
       " 904: '承包',\n",
       " 905: '笑点',\n",
       " 906: '量身定做',\n",
       " 907: '彭于',\n",
       " 908: '晏',\n",
       " 909: '可演',\n",
       " 910: '不来',\n",
       " 911: '不少',\n",
       " 912: '漂移',\n",
       " 913: '无人机',\n",
       " 914: '突袭',\n",
       " 915: '直升机',\n",
       " 916: '坠',\n",
       " 917: '露',\n",
       " 918: '肉搏',\n",
       " 919: '军舰',\n",
       " 920: '发射',\n",
       " 921: '叛乱',\n",
       " 922: '国际化',\n",
       " 923: '视角',\n",
       " 924: '标配',\n",
       " 925: '饰演',\n",
       " 926: '深入人心',\n",
       " 927: '搏命',\n",
       " 928: '精神',\n",
       " 929: '当下',\n",
       " 930: '第三部',\n",
       " 931: '表白',\n",
       " 932: '典型',\n",
       " 933: '方式',\n",
       " 934: '每次',\n",
       " 935: '猜',\n",
       " 936: '诶',\n",
       " 937: '问',\n",
       " 938: '王牌',\n",
       " 939: '特工',\n",
       " 940: '那么',\n",
       " 941: '杀人',\n",
       " 942: '经过',\n",
       " 943: '艺术',\n",
       " 944: '处理',\n",
       " 945: '直接',\n",
       " 946: '删',\n",
       " 947: '血腥',\n",
       " 948: '屠杀',\n",
       " 949: '赤裸裸',\n",
       " 950: '大段',\n",
       " 951: '正确',\n",
       " 952: '庇',\n",
       " 953: '衣',\n",
       " 954: '意料之中',\n",
       " 955: '意料之外',\n",
       " 956: '惊喜',\n",
       " 957: '属于',\n",
       " 958: '狼性',\n",
       " 959: '军魂',\n",
       " 960: '几个',\n",
       " 961: '网红',\n",
       " 962: '弹弹琴',\n",
       " 963: '大国',\n",
       " 964: '气象',\n",
       " 965: '满屏',\n",
       " 966: '告诉',\n",
       " 967: '吴',\n",
       " 968: '迪塞尔',\n",
       " 969: '如入无人之境',\n",
       " 970: '亿',\n",
       " 971: '大陆',\n",
       " 972: '一刻',\n",
       " 973: '集体',\n",
       " 974: '勃起',\n",
       " 975: '离开',\n",
       " 976: '影厅',\n",
       " 977: '屌丝',\n",
       " 978: '同样',\n",
       " 979: '开始',\n",
       " 980: '前',\n",
       " 981: '屌',\n",
       " 982: '一万倍',\n",
       " 983: '一次',\n",
       " 984: '标准',\n",
       " 985: '打造',\n",
       " 986: '美式',\n",
       " 987: '不可逆转',\n",
       " 988: '缺点',\n",
       " 989: '笑料',\n",
       " 990: '一定',\n",
       " 991: '程度',\n",
       " 992: '地',\n",
       " 993: '破坏',\n",
       " 994: '节奏感',\n",
       " 995: '斥',\n",
       " 996: '巨资',\n",
       " 997: '炮制',\n",
       " 998: '有所',\n",
       " 999: '体验',\n",
       " ...}"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reverse lookup table: integer index -> token (the inverse of word2index).\n",
    "index2word = dict(zip(word2index.values(), word2index.keys()))\n",
    "index2word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 将文本转化成向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sent2vec(sent):\n",
    "    \"\"\"Map a whitespace-tokenised sentence to its list of vocabulary indices.\n",
    "\n",
    "    Args:\n",
    "        sent: String whose tokens are separated by whitespace.\n",
    "\n",
    "    Returns:\n",
    "        List of ints looked up in the notebook-level ``word2index`` mapping,\n",
    "        in token order. Tokens missing from ``word2index`` are skipped, so\n",
    "        text containing unseen words no longer raises ``KeyError``.\n",
    "    \"\"\"\n",
    "    tokens = sent.strip().split()\n",
    "    # Out-of-vocabulary tokens have no index to map to; drop them.\n",
    "    return [word2index[tok] for tok in tokens if tok in word2index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'吴京 的 炒作 水平 不输 冯小刚 但 小刚 至少 不会 用 主旋律 来 炒作 吴京 让 人 看 了 不 舒服 为了 主旋律 而 主旋律 为了 煽情 而 煽情 让 人 觉得 他 是 个 大 做作 大 谎言 家 更新 片子 整体 不如 湄公河 行动 整体 不够 流畅 编剧 有毒 台词 尴尬 刻意 做作 的 主旋律 煽情 显得 如此 不合时宜 而 又 多余'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comments['cleaned_comment'].iloc[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1,\n",
       " 6,\n",
       " 43,\n",
       " 44,\n",
       " 45,\n",
       " 46,\n",
       " 47,\n",
       " 48,\n",
       " 49,\n",
       " 50,\n",
       " 51,\n",
       " 52,\n",
       " 53,\n",
       " 43,\n",
       " 1,\n",
       " 54,\n",
       " 55,\n",
       " 8,\n",
       " 4,\n",
       " 56,\n",
       " 57,\n",
       " 58,\n",
       " 52,\n",
       " 59,\n",
       " 52,\n",
       " 58,\n",
       " 60,\n",
       " 59,\n",
       " 60,\n",
       " 54,\n",
       " 55,\n",
       " 61,\n",
       " 22,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 64,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 70,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 65,\n",
       " 6,\n",
       " 52,\n",
       " 60,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 59,\n",
       " 84,\n",
       " 85]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sent2vec(comments['cleaned_comment'].iloc[2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 将句子的向量转化成相同长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every cleaned comment as a list of vocabulary indices.\n",
    "X = list(map(sent2vec, comments['cleaned_comment']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 100\n",
    "# Left-pad (or left-truncate) each index list to exactly `maxlen` entries.\n",
    "# 'pre' is already the Keras default for both options; it is only spelled out here.\n",
    "X = sequence.pad_sequences(X, maxlen=maxlen, padding='pre', truncating='pre')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[    0,     0,     0, ...,     9,    10,    11],\n",
       "       [    0,     0,     0, ...,    40,    41,    42],\n",
       "       [    0,     0,     0, ...,    59,    84,    85],\n",
       "       ...,\n",
       "       [    0,     0,     0, ...,    56,   828,   339],\n",
       "       [    0,     0,     0, ...,   429,     8,   446],\n",
       "       [    0,     0,     0, ..., 11072,  6306,     9]], dtype=int32)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(248396, 100)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         1\n",
       "1         2\n",
       "2         2\n",
       "3         4\n",
       "4         1\n",
       "         ..\n",
       "261492    3\n",
       "261493    3\n",
       "261494    2\n",
       "261495    3\n",
       "261496    3\n",
       "Name: star, Length: 248396, dtype: int64"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Target variable: the star rating attached to each comment.\n",
    "y = comments.loc[:, 'star']\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_categorical(y, num_classes=None):\n",
    "    \"\"\"One-hot encode 1-based integer class labels.\n",
    "\n",
    "    Args:\n",
    "        y: 1-D sequence of integer labels starting at 1 (label k sets column k-1).\n",
    "        num_classes: Optional number of columns. Defaults to the largest label\n",
    "            found in ``y``, which is how the width was always chosen before.\n",
    "\n",
    "    Returns:\n",
    "        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.\n",
    "        Empty input yields an array with zero rows.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a label is below 1 or above ``num_classes``.\n",
    "    \"\"\"\n",
    "    labels = np.array(y, dtype='int')\n",
    "    n = len(labels)\n",
    "    if n == 0:\n",
    "        # np.max() would raise on an empty array; return an empty encoding instead.\n",
    "        return np.zeros((0, int(num_classes or 0)))\n",
    "    lowest, highest = labels.min(), labels.max()\n",
    "    if lowest < 1:\n",
    "        # A label of 0 would index column -1 and silently mark the last class.\n",
    "        raise ValueError('labels must be 1-based positive integers, got %d' % lowest)\n",
    "    width = int(highest) if num_classes is None else int(num_classes)\n",
    "    if highest > width:\n",
    "        raise ValueError('label %d exceeds num_classes=%d' % (highest, width))\n",
    "    categorical = np.zeros((n, width))\n",
    "    categorical[np.arange(n), labels - 1] = 1\n",
    "    return categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode straight from the source column rather than from `y` itself, so that\n",
    "# re-running this cell does not feed an already one-hot matrix back in.\n",
    "y = to_categorical(comments['star'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0., 0., 0.],\n",
       "       [0., 1., 0., 0., 0.],\n",
       "       [0., 1., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 1., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0.],\n",
       "       [0., 0., 1., 0., 0.]])"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 嵌入矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ],\n",
       "       [ 2.12430072, -0.39485088, -1.34238815, ...,  2.30412221,\n",
       "         1.03823328, -1.30636704],\n",
       "       [ 0.26209503, -0.49268979, -0.51753461, ...,  1.32445359,\n",
       "        -0.33967397,  0.66318768],\n",
       "       ...,\n",
       "       [-0.26862964, -0.23968956,  0.16119906, ...,  0.01158467,\n",
       "         0.16363347,  0.01866432],\n",
       "       [ 0.30251694,  0.03766752,  0.73150182, ...,  0.35207793,\n",
       "        -0.23663767,  0.10812829],\n",
       "       [-0.04078084, -0.02281673, -0.10331304, ...,  0.08166362,\n",
       "        -0.16539493, -0.12828492]])"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embed_size = 50\n",
    "nb_words = len(vocab) + 1\n",
    "embedding_matrix = np.zeros((nb_words, embed_size))\n",
    "# Copy the pretrained vector into the row of every token that `wv` knows;\n",
    "# rows of all other indices keep their initial zeros.\n",
    "for word, i in word2index.items():\n",
    "    if word not in wv:\n",
    "        continue\n",
    "    embedding_matrix[i] = wv[word]\n",
    "\n",
    "embedding_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(98553, 50)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embedding_matrix.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基准模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline 5-class classifier: pretrained embeddings -> BiLSTM -> max-pool -> MLP.\n",
    "inp = Input(shape=(maxlen,))\n",
    "stack = [\n",
    "    Embedding(nb_words, embed_size, weights=[embedding_matrix]),\n",
    "    Bidirectional(\n",
    "        LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
    "    GlobalMaxPool1D(),\n",
    "    Dense(50, activation='relu'),\n",
    "    Dropout(0.1),\n",
    "    Dense(5, activation='softmax'),  # one probability per star rating\n",
    "]\n",
    "# Apply the layers in order; `x` ends up as the output tensor.\n",
    "x = inp\n",
    "for layer in stack:\n",
    "    x = layer(x)\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(optimizer='adam',\n",
    "              loss='categorical_crossentropy',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 186297 samples, validate on 62099 samples\n",
      "Epoch 1/2\n",
      "186297/186297 [==============================] - 152s 814us/sample - loss: 1.3473 - accuracy: 0.3875 - val_loss: 1.2838 - val_accuracy: 0.4221\n",
      "Epoch 2/2\n",
      "186297/186297 [==============================] - 150s 804us/sample - loss: 1.2257 - accuracy: 0.4502 - val_loss: 1.2622 - val_accuracy: 0.4317\n"
     ]
    }
   ],
   "source": [
    "# Train the 5-class baseline for two epochs, holding out 25% for validation.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    batch_size=128,\n",
    "    epochs=2,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 五分类转化成三分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to3cat(x):\n",
    "    \"\"\"Collapse a star rating into three classes.\n",
    "\n",
    "    Returns 1 (bad film) for ratings below 3, 2 (average) for exactly 3,\n",
    "    and 3 (good film) for anything else.\n",
    "    \"\"\"\n",
    "    if x < 3:\n",
    "        return 1  # bad film\n",
    "    if x == 3:\n",
    "        return 2  # average\n",
    "    return 3  # good film"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-label every rating with its 3-class bucket.\n",
    "y = comments['star'].map(to3cat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       ...,\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.]])"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode straight from the source column rather than from `y` itself, so that\n",
    "# re-running this cell does not feed an already one-hot matrix back in.\n",
    "y = to_categorical(comments['star'].apply(to3cat))\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3-class classifier: same architecture as the baseline, with a wider BiLSTM.\n",
    "inp = Input(shape=(maxlen,))\n",
    "stack = [\n",
    "    Embedding(nb_words, embed_size, weights=[embedding_matrix]),\n",
    "    Bidirectional(\n",
    "        LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
    "    GlobalMaxPool1D(),\n",
    "    Dense(50, activation='relu'),\n",
    "    Dropout(0.1),\n",
    "    Dense(3, activation='softmax'),  # bad / average / good\n",
    "]\n",
    "# Apply the layers in order; `x` ends up as the output tensor.\n",
    "x = inp\n",
    "for layer in stack:\n",
    "    x = layer(x)\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(optimizer='adam',\n",
    "              loss='categorical_crossentropy',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 186297 samples, validate on 62099 samples\n",
      "Epoch 1/2\n",
      "186297/186297 [==============================] - 156s 840us/sample - loss: 0.8209 - accuracy: 0.6331 - val_loss: 0.8541 - val_accuracy: 0.6168\n",
      "Epoch 2/2\n",
      "186297/186297 [==============================] - 155s 831us/sample - loss: 0.7184 - accuracy: 0.6872 - val_loss: 0.7939 - val_accuracy: 0.6434\n"
     ]
    }
   ],
   "source": [
    "# Train the 3-class model for two epochs, holding out 25% for validation.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    batch_size=128,\n",
    "    epochs=2,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 二分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to2cat(x):\n",
    "    \"\"\"Binarise a star rating: 0 (negative) for 3 stars or fewer, else 1 (positive).\"\"\"\n",
    "    return 0 if x <= 3 else 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         0\n",
       "1         0\n",
       "2         0\n",
       "3         1\n",
       "4         0\n",
       "         ..\n",
       "261492    0\n",
       "261493    0\n",
       "261494    0\n",
       "261495    0\n",
       "261496    0\n",
       "Name: star, Length: 248396, dtype: int64"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-label every rating as negative (0) or positive (1).\n",
    "y = comments['star'].map(to2cat)\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary sentiment classifier: pretrained embeddings -> BiLSTM -> max-pool -> MLP.\n",
    "inp = Input(shape=(maxlen, ))\n",
    "x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)\n",
    "x = Bidirectional(\n",
    "    LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)\n",
    "x = GlobalMaxPool1D()(x)\n",
    "x = Dense(50, activation=\"relu\")(x)\n",
    "x = Dropout(0.1)(x)\n",
    "# A single output unit needs a sigmoid: softmax over one value is always 1.0,\n",
    "# so the network would predict the positive class for every sample and the\n",
    "# binary cross-entropy loss could never improve.\n",
    "x = Dense(1, activation='sigmoid')(x)\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(loss='binary_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the binary model for two epochs, holding out 25% for validation.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    batch_size=128,\n",
    "    epochs=2,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
